/* SPDX-License-Identifier: Apache-2.0 */ /** * Copyright 2026 Thorsten Töpper * * @file file_processor.c * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include /* https://docs.openssl.org/master/man3/EVP_DigestInit/ */ #include #include "file_processor.h" #include "trace_macros.h" #include "hex_conversion.h" #include "kv_manager.h" #include "database_interaction.h" #include "options.h" /*=========== DEFINES, CONSTANTS AND TYPES ===========*/ #ifndef BUFSIZE4MIB /* Personnal observation: dd bs=4M usually gets good performance regardless of SSD/HDD or USB 2.x/3.x */ #define BUFSIZE4MIB 4194304 #endif /** * Objects from the OpenSSL library * Improved later code readability by placing those together and manage outside the file handling code */ struct df_md_components { EVP_MD_CTX *mdctx_blake2; EVP_MD_CTX *mdctx_sha256; EVP_MD_CTX *mdctx_sha512; }; /*=========== GLOBAL VARIABLES ===========*/ /** TODO: Should be set via command line parameter, move to handling when implementing */ size_t glbl_bufsize = BUFSIZE4MIB; const EVP_MD *glbl_md_blake2 = NULL; const EVP_MD *glbl_md_sha256 = NULL; const EVP_MD *glbl_md_sha512 = NULL; /*=========== FUNCTIONS ===========*/ struct df_md_components *init_md_components(); void destroy_md_components(struct df_md_components *pkg); void print_fileinfo(struct df_fileinfo *info); /** * Prepare a df_md_components struct for active usage. */ inline struct df_md_components *init_md_components() { struct df_md_components *pkg = NULL; if ((pkg=calloc(1,sizeof(struct df_md_components))) == NULL) { LOGERR("ERROR: Failed to allocate heap memory for a struct df_md_components errno %d: %s\n", errno, strerror(errno)); return NULL; } /* TODO: research whether performance difference is really worth this additional complexity, * the util is processing files from storage as single thread, not tons of <1KiB blobs in parallel. * So a few nano- or microseconds are mostly insignificant compared to the I/O throttle. */ if (glbl_md_blake2 == NULL) { glbl_md_blake2 = EVP_blake2b512(); if (glbl_md_blake2 == NULL) { LOGERR("ERROR: Failed to fetch EVP_MD for BLAKE2\n"); return NULL; } } if (glbl_md_sha256 == NULL) { glbl_md_sha256 = EVP_sha256(); if (glbl_md_sha256 == NULL) { LOGERR("ERROR: Failed to fetch EVP_MD for SHA2_256\n"); return NULL; } } if (glbl_md_sha512 == NULL) { glbl_md_sha512 = EVP_sha512(); if (glbl_md_sha512 == NULL) { LOGERR("ERROR: Failed to fetch EVP_MD for SHA2_512\n"); return NULL; } } /* Create the contexts */ if ((pkg->mdctx_blake2 = EVP_MD_CTX_new()) == NULL) { LOGERR("ERROR: Failed to create context for BLAKE2\n"); free(pkg); return NULL; } if ((pkg->mdctx_sha256 = EVP_MD_CTX_new()) == NULL) { LOGERR("ERROR: Failed to create context for SHA256\n"); EVP_MD_CTX_free(pkg->mdctx_blake2); free(pkg); return NULL; } if ((pkg->mdctx_sha512 = EVP_MD_CTX_new()) == NULL) { LOGERR("ERROR: Failed to create context for SHA512\n"); EVP_MD_CTX_free(pkg->mdctx_blake2); EVP_MD_CTX_free(pkg->mdctx_sha256); free(pkg); return NULL; } /* Initialize them */ if (EVP_DigestInit_ex2(pkg->mdctx_blake2, glbl_md_blake2, NULL) != 1) { LOGERR("ERROR: Failed to initialize BLAKE2 context\n"); EVP_MD_CTX_free(pkg->mdctx_blake2); EVP_MD_CTX_free(pkg->mdctx_sha256); EVP_MD_CTX_free(pkg->mdctx_sha512); free(pkg); return NULL; } if (EVP_DigestInit_ex2(pkg->mdctx_sha256, glbl_md_sha256, NULL) != 1) { LOGERR("ERROR: Failed to initialize SHA256 context\n"); EVP_MD_CTX_free(pkg->mdctx_blake2); EVP_MD_CTX_free(pkg->mdctx_sha256); EVP_MD_CTX_free(pkg->mdctx_sha512); free(pkg); return NULL; } if (EVP_DigestInit_ex2(pkg->mdctx_sha512, glbl_md_sha512, NULL) != 1) { LOGERR("ERROR: Failed to initialize SHA512 context\n"); EVP_MD_CTX_free(pkg->mdctx_blake2); EVP_MD_CTX_free(pkg->mdctx_sha256); EVP_MD_CTX_free(pkg->mdctx_sha512); free(pkg); return NULL; } return pkg; } /** * Free all memory related to the given struct including itself * @param pkg the struct to destroy */ inline void destroy_md_components(struct df_md_components *pkg) { if (pkg == NULL) { return; } if (pkg->mdctx_blake2 != NULL) { EVP_MD_CTX_free(pkg->mdctx_blake2); } if (pkg->mdctx_sha256 != NULL) { EVP_MD_CTX_free(pkg->mdctx_sha256); } if (pkg->mdctx_sha512 != NULL) { EVP_MD_CTX_free(pkg->mdctx_sha512); } free(pkg); } /** * Read the file defined by path and name in the argument struct, and add stat * results and binary represented hashes of the file into the struct. * * @param info struct contains the path of the file to read, results will be * stored there. * * @return 1 if skipped * 0 on success * -1 on failure */ int process_file(struct df_fileinfo *info) { FILE *fdin = NULL; char fullpath[4096]; unsigned char buffer[glbl_bufsize]; size_t bytes_read; struct df_md_components *ctx_pkg; bool error_in_loop = false; struct df_fileinfo info_from_db; unsigned char md_val[EVP_MAX_MD_SIZE]; unsigned int md_len; if (info == NULL || info->name == NULL || info->path == NULL || info->name[0] == '\0' || info->path[0] == '\0') { LOGERR("ERROR: Not enough information to construct a full path.\n"); return -1; } if (snprintf(fullpath, 4096, "%s/%s", info->path, info->name) < 0) { LOGERR("ERROR: Failed to print fullpath string into stack memory: %s (errno %d)\n", strerror(errno), errno); return -1; } if (lstat(fullpath, &(info->statbuf)) == -1) { LOGERR("ERROR: lstat() call failed for file %s: %s (errno %d)\n", fullpath, strerror(errno), errno); return -1; } if ((info->statbuf.st_mode & S_IFMT) != S_IFREG) { LOGERR("ERROR: Non-regular files are not processed.\n"); return -1; } /* filesystem information collected check whether the DB has a corresponding value, * if so check by size and time whether it looks modified if not, skip */ info_from_db.path = info->path; info_from_db.name = info->name; if ( ! option_force_scan && (dbi_fill_fileinfo(&info_from_db) == 0) && (info->statbuf.st_size == info_from_db.statbuf.st_size) && (info->statbuf.st_mtim.tv_sec < info_from_db.last_seen) && (info->statbuf.st_mtim.tv_sec == info_from_db.statbuf.st_mtim.tv_sec) && (info->statbuf.st_mtim.tv_nsec == info_from_db.statbuf.st_mtim.tv_nsec)) { if ( ! option_quiet ) { LOGERR("Skip file '%s' file unchanged according to metadata\n", fullpath); } dbi_update_fileinfo_last_seen(info_from_db.id); return 1; } if ((ctx_pkg = init_md_components()) == NULL) { LOGERR("ERROR: Failed to initialize/create md contexts to be used with %s\n", fullpath); return -1; } if ((fdin=fopen(fullpath, "rb")) == NULL) { LOGERR("ERROR: Failed to open file '%s' %s\n", fullpath, strerror(errno)); destroy_md_components(ctx_pkg); return -1; } if ( ! option_quiet ) { LOGERR("Calculating hashes for file '%s'\n", fullpath); } /* TODO: proper fread related error handling */ while (true) { bytes_read = fread(buffer, sizeof(unsigned char), glbl_bufsize, fdin); if (EVP_DigestUpdate(ctx_pkg->mdctx_blake2, buffer, bytes_read) != 1) { LOGERR("ERROR: Failed to update message digest BLAKE2 of file '%s'\n", fullpath); error_in_loop = true; break; } if (EVP_DigestUpdate(ctx_pkg->mdctx_sha256, buffer, bytes_read) != 1) { LOGERR("ERROR: Failed to update message digest SHA256 of file '%s'\n", fullpath); error_in_loop = true; break; } if (EVP_DigestUpdate(ctx_pkg->mdctx_sha512, buffer, bytes_read) != 1) { LOGERR("ERROR: Failed to update message digest SHA512 of file '%s'\n", fullpath); error_in_loop = true; break; } if (bytes_read != glbl_bufsize) { if (feof(fdin) != 0) { break; } if (ferror(fdin) != 0) { LOGERR("ERROR: Failed to read from %s: %s (errno %d)\n", fullpath, strerror(errno), errno); error_in_loop = true; break; } } } fclose(fdin); fdin = NULL; if (error_in_loop) { destroy_md_components(ctx_pkg); return -1; } if (EVP_DigestFinal_ex(ctx_pkg->mdctx_blake2, md_val, &md_len) != 1) { LOGERR("ERROR: Failed to finalize MD BLAKE2 of file '%s'\n", fullpath); destroy_md_components(ctx_pkg); return -1; } convert_from_binary(md_val, md_len, info->hashes.blake2); if (EVP_DigestFinal_ex(ctx_pkg->mdctx_sha256, md_val, &md_len) != 1) { LOGERR("ERROR: Failed to finalize MD SHA256 of file '%s'\n", fullpath); destroy_md_components(ctx_pkg); return -1; } convert_from_binary(md_val, md_len, info->hashes.sha256); if (EVP_DigestFinal_ex(ctx_pkg->mdctx_sha512, md_val, &md_len) != 1) { LOGERR("ERROR: Failed to finalize MD SHA512 of file '%s'\n", fullpath); destroy_md_components(ctx_pkg); return -1; } convert_from_binary(md_val, md_len, info->hashes.sha512); destroy_md_components(ctx_pkg); return 0; } /** * Return a file info struct with path and filename fields filled. * @param key the fullpath used as key in the gdbm. * @return NULL on failure */ struct df_fileinfo *prepare_fileinfo(char *key) { char *tmp; char *fname = NULL; size_t plen=0, flen=0; struct df_fileinfo *info = NULL; if (key == NULL || key[0] == '\0') { return NULL; } if ((fname=strrchr(key, '/')) == NULL) { LOGERR("ERROR: path<->filename separation failed with '%s'\n", key); return NULL; } /* At this point the address of fname will always be equal or larger than keys */ plen = (size_t) (fname - key); fname++; /* drop the / */ flen = strlen(fname); if ((info=calloc(1, sizeof(struct df_fileinfo))) == NULL) { return NULL; } if ((tmp = calloc(plen+1, sizeof(char))) == NULL) { free(info); return NULL; } memcpy(tmp, key, plen); info->path = tmp; if ((tmp = calloc(flen+1, sizeof(char))) == NULL) { free(info->path); free(info); return NULL; } memcpy(tmp, fname, flen); info->name = tmp; DBGTRC("DEBUG: key '%s' | fname '%s' | info->path '%s' | info->name '%s'\n", key, fname, info->path, info->name); return info; } /** * Iterate over the whole gdbm content. If an entry is an unprocessed file, * process it, place the information in the database and set it as processed * in the storage. * @return 0 on success * <0 on failure */ int process_gdbm_content() { char *key, *tmpkey; struct df_fileinfo *info; int dbrc = 0, fsrc = 0; key = kv_first_key(); while (key != NULL) { DBGTRC("DEBUG: key '%s' | processed: '%c' | type '%c'\n", key, (kv_get_bool(key)) ? 'T' : 'F', kv_get_type(key)); /* file? already processed? */ if (kv_get_type(key) == 'D' || kv_get_bool(key)) { DBGTRC("DEBUG: Skip directory\n"); tmpkey = key; key = kv_next_key(tmpkey); free(tmpkey); continue; } info = prepare_fileinfo(key); if (info == NULL) { LOGERR("ERROR: Preparing struct for key %s failed.\n", key); return -1; } fsrc = process_file(info); if (fsrc < 0) { LOGERR("ERROR: Failed to process file %s\n", key); free(info->path); free(info->name); free(info); free(key); return -1; } #ifdef DEBUGBUILD print_fileinfo(info); #endif /* process_file 1 -> skip hashes etc not copied */ if (fsrc == 0) { dbrc = dbi_insert_fileinfo(info); } free(info->path); free(info->name); free(info); if (dbrc < 0) { LOGERR("ERROR: Aborting after database error.\n"); free(key); return -1; } kv_set_bool(key, true); tmpkey = key; key = kv_next_key(tmpkey); free(tmpkey); } return 0; } inline void print_fileinfo(struct df_fileinfo *info) { fprintf(stderr, "info->path = \"%s\"\n", info->path); fprintf(stderr, "info->name = \"%s\"\n", info->name); fprintf(stderr, "info->hashes.blake2 = \"%s\"\n", info->hashes.blake2); fprintf(stderr, "info->hashes.sha256 = \"%s\"\n", info->hashes.sha256); fprintf(stderr, "info->hashes.sha512 = \"%s\"\n", info->hashes.sha512); fprintf(stderr, "info->statbuf.st_size = \"%ld\"\n", info->statbuf.st_size); }