diff options
| author | Thorsten Töpper <atsutane@freethoughts.de> | 2026-02-07 21:43:17 +0100 |
|---|---|---|
| committer | Thorsten Töpper <atsutane@freethoughts.de> | 2026-02-07 21:43:17 +0100 |
| commit | eed2d1323441861f2d41f0ecc0a72fcc9190fa5f (patch) | |
| tree | 779cd7c1768504308e9957cfbc5cfc271e89f1c5 | |
| parent | b7d09007d04c3b7c38848dd05d6105f3354b6b15 (diff) | |
| download | duplicate_finder-eed2d1323441861f2d41f0ecc0a72fcc9190fa5f.tar.gz duplicate_finder-eed2d1323441861f2d41f0ecc0a72fcc9190fa5f.tar.bz2 | |
file processor: Copied from my small-utils project
| -rw-r--r-- | include/file_processor.h | 41 | ||||
| -rw-r--r-- | include/hex_conversion.h | 113 | ||||
| -rw-r--r-- | include/trace_macros.h | 23 | ||||
| -rw-r--r-- | src/file_processor.c | 292 |
4 files changed, 469 insertions, 0 deletions
diff --git a/include/file_processor.h b/include/file_processor.h new file mode 100644 index 0000000..8cfb6de --- /dev/null +++ b/include/file_processor.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2026 Thorsten Töpper + * + * vim:ts=4:sw=4:expandtab + */ +#ifndef FILE_PROCESSOR_H +#define FILE_PROCESSOR_H + +#include <sys/stat.h> + +#define DF_BYTE_SIZE_256 32 +#define DF_BYTE_SIZE_512 64 + +/* Aliases for convenience, currently all algorithms are part of the default + * provider. */ +#define DF_OSSL_BLAKE2 "BLAKE2B-512" +#define DF_OSSL_SHA256 "SHA2-256" +#define DF_OSSL_SHA512 "SHA2-512" + +/** + * information about a file + * Contains filepath, stat() results, hash values of multiple algorithms. + * TODO: Organize the paths in a global pool (list/tree/map) and only refer there + * without any free() calls triggered through the pointer in the struct. + */ +struct df_fileinfo { + char *path; /**< pointer to the path of the file */ + char *name; /**< pointer to the name of the file */ + unsigned char blake2[DF_BYTE_SIZE_512]; /**< The BLAKE2-512 hash in binary form */ + unsigned char sha256[DF_BYTE_SIZE_256]; /**< The SHA256 hash in binary form. */ + unsigned char sha512[DF_BYTE_SIZE_512]; /**< The SHA512 hash in binary form. */ + struct stat statbuf; /**< Result of lstat() call. Symlinks are to be ignored and filtered out earlier. */ +}; + + +/*=========== FUNCTIONS ===========*/ +int process_file(struct df_fileinfo *info); + +#endif + diff --git a/include/hex_conversion.h b/include/hex_conversion.h new file mode 100644 index 0000000..90ab9e4 --- /dev/null +++ b/include/hex_conversion.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2026 Thorsten Töpper + * + * vim:ts=4:sw=4:expandtab + */ +#ifndef HEX_CONVERSION_H +#define HEX_CONVERSION_H + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#ifdef DEBUGBUILD +#include "trace_macros.h" +#endif + +#define ishex_macro(c) ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f')) + +int convert_line(char *s); +int ishex(unsigned char c); +int ishex_string(const char *s, size_t l); +unsigned char *convert_to_binary(char *hex, unsigned char *out); +char *convert_from_binary(unsigned char *bin, size_t l, char *out); + +/* short inline functions are fine in header */ +inline int convert_line(char *s) { + size_t i = 0, l = 0; + if (s == NULL) + return -1; + l=strlen(s); + for (i=0; i<l; i++) { + s[i] = (char)toupper(s[i]); + if ((s[i] == '\r' && (l-i<3)) || (s[i] == '\n' && i==l-1)) { + s[i] = '\0'; + break; + } + } + return 0; +} + +inline int ishex(unsigned char c) { + if ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f')) { + return 1; + } + return 0; +}; + +inline int ishex_string(const char *s, size_t l) { + size_t i = 0; + if (s == 0) + return 0; + if (l == 0) + l = strlen(s); + for (; i<l; i++) { + if ( ! ishex_macro(s[i]) ) + return 0; + } + return 1; +} + +inline unsigned char *convert_to_binary(char *hex, unsigned char *out) { + char tmp[3] = {0,0,0}; + size_t length, i; + if (hex == NULL) return NULL; + length=strlen(hex); + if ( (length==0) || (length%2 == 1)) return NULL; + for (i=0; i<length; i++) { + if ( ! ishex_macro(hex[i]) ) { +#ifdef DEBUGBUILD + LOGERR("Incompatible string '%s'\n", hex); +#endif + return NULL; + } + } + if (out == NULL && ((out = calloc((length/2),sizeof(char))) == NULL)) { +#ifdef DEBUGBUILD + LOGERR("ERROR: Failed to allocate %lu bytes\n", (length/2)); +#endif + return NULL; + } + for (i=0;i<length;i+=2) { + tmp[0] = hex[i]; + tmp[1] = hex[i+1]; + out[i/2] = (unsigned char) strtol(tmp, NULL, 16); + } + return out; +} + +/* Use a large buffer and complex method, as with a simple + * way there regularly were corrupt results with gcc -O2. */ +inline char *convert_from_binary(unsigned char *bin, size_t l, char *out) { + char tmp[24]; + size_t i,pos; + if (bin == NULL || l == 0) return NULL; + if (out == NULL && (out = calloc(((l*2)+1),sizeof(char))) == NULL) { +#ifdef DEBUGBUILD + LOGERR("ERROR: Failed to allocate %lu bytes\n", ((l*2)+1)); +#endif + return NULL; + } + for (i=0; i<l; i++) { + /* Keep in mind this format is not only two characters. */ + sprintf(tmp, "%02X", (unsigned char)bin[i]); + pos=strlen(tmp); + out[i*2] = tmp[pos-2]; + out[(i*2)+1] = tmp[pos-1]; + } + return out; +} + +#endif + diff --git a/include/trace_macros.h b/include/trace_macros.h new file mode 100644 index 0000000..b27dc40 --- /dev/null +++ b/include/trace_macros.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2026 Thorsten Töpper + * + * vim:ts=4:sw=4:expandtab + */ +#ifndef TRACE_MACROS_H +#define TRACE_MACROS_H + +#include <stdio.h> + +#ifndef LOGERR +#define LOGERR(...) {fprintf(stderr, "[%s:%d] %s: ", __FILE__, __LINE__, __func__); fprintf(stderr, __VA_ARGS__);} +#endif + +#ifdef DEBUGBUILD +#define DBGTRC(...) LOGERR(__VA_ARGS__) +#else +#define DBGTRC(...) +#endif + +#endif + diff --git a/src/file_processor.c b/src/file_processor.c new file mode 100644 index 0000000..1cfed46 --- /dev/null +++ b/src/file_processor.c @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Copyright 2026 Thorsten Töpper + * + * @file file_processor.c + * + * vim:ts=4:sw=4:expandtab + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdbool.h> +#include <stdint.h> +#include <errno.h> + + +/* https://docs.openssl.org/master/man3/EVP_DigestInit/ */ +#include <openssl/evp.h> + +#include "file_processor.h" +#include "trace_macros.h" + + +/*=========== DEFINES, CONSTANTS AND TYPES ===========*/ + +#ifndef BUFSIZE4MIB +/* Personnal observation: dd bs=4M usually gets good performance regardless of SSD/HDD or USB 2.x/3.x */ +#define BUFSIZE4MIB 4194304 +#endif + + +/** + * Objects from the OpenSSL library + * Improved later code readability by placing those together and manage outside the file handling code + */ +struct df_md_components { + EVP_MD_CTX *mdctx_blake2; + EVP_MD_CTX *mdctx_sha256; + EVP_MD_CTX *mdctx_sha512; +}; + + +/*=========== GLOBAL VARIABLES ===========*/ + +/** TODO: Should be set via command line parameter, move to handling when implementing + */ +size_t glbl_bufsize = BUFSIZE4MIB; + +const EVP_MD *glbl_md_blake2 = NULL; +const EVP_MD *glbl_md_sha256 = NULL; +const EVP_MD *glbl_md_sha512 = NULL; + + + +/*=========== FUNCTIONS ===========*/ +struct df_md_components *init_md_components(); +void destroy_md_components(struct df_md_components *pkg); + + +/** + * Prepare a df_md_components struct for active usage. + */ +inline struct df_md_components *init_md_components() { + struct df_md_components *pkg = NULL; + + if ((pkg=calloc(1,sizeof(struct df_md_components))) == NULL) { + LOGERR("ERROR: Failed to allocate heap memory for a struct df_md_components errno %d: %s\n", + errno, strerror(errno)); + return NULL; + } + + /* TODO: research whether performance difference is really worth this additional complexity, + * the util is processing files from storage as single thread, not tons of <1KiB blobs in parallel. + * So a few nano- or microseconds are mostly insignificant compared to the I/O throttle. */ + if (glbl_md_blake2 == NULL) { + glbl_md_blake2 = EVP_blake2b512(); + if (glbl_md_blake2 == NULL) { + LOGERR("ERROR: Failed to fetch EVP_MD for BLAKE2\n"); + return NULL; + } + } + if (glbl_md_sha256 == NULL) { + glbl_md_sha256 = EVP_sha256(); + if (glbl_md_sha256 == NULL) { + LOGERR("ERROR: Failed to fetch EVP_MD for SHA2_256\n"); + return NULL; + } + } + if (glbl_md_sha512 == NULL) { + glbl_md_sha512 = EVP_sha512(); + if (glbl_md_sha512 == NULL) { + LOGERR("ERROR: Failed to fetch EVP_MD for SHA2_512\n"); + return NULL; + } + } + + /* Create the contexts */ + if ((pkg->mdctx_blake2 = EVP_MD_CTX_new()) == NULL) { + LOGERR("ERROR: Failed to create context for BLAKE2\n"); + free(pkg); + return NULL; + } + + if ((pkg->mdctx_sha256 = EVP_MD_CTX_new()) == NULL) { + LOGERR("ERROR: Failed to create context for SHA256\n"); + EVP_MD_CTX_free(pkg->mdctx_blake2); + free(pkg); + return NULL; + } + + if ((pkg->mdctx_sha512 = EVP_MD_CTX_new()) == NULL) { + LOGERR("ERROR: Failed to create context for SHA512\n"); + EVP_MD_CTX_free(pkg->mdctx_blake2); + EVP_MD_CTX_free(pkg->mdctx_sha256); + free(pkg); + return NULL; + } + + /* Initialize them */ + if (EVP_DigestInit_ex2(pkg->mdctx_blake2, glbl_md_blake2, NULL) != 1) { + LOGERR("ERROR: Failed to initialize BLAKE2 context\n"); + EVP_MD_CTX_free(pkg->mdctx_blake2); + EVP_MD_CTX_free(pkg->mdctx_sha256); + EVP_MD_CTX_free(pkg->mdctx_sha512); + free(pkg); + return NULL; + } + + if (EVP_DigestInit_ex2(pkg->mdctx_sha256, glbl_md_sha256, NULL) != 1) { + LOGERR("ERROR: Failed to initialize SHA256 context\n"); + EVP_MD_CTX_free(pkg->mdctx_blake2); + EVP_MD_CTX_free(pkg->mdctx_sha256); + EVP_MD_CTX_free(pkg->mdctx_sha512); + free(pkg); + return NULL; + } + + if (EVP_DigestInit_ex2(pkg->mdctx_sha512, glbl_md_sha512, NULL) != 1) { + LOGERR("ERROR: Failed to initialize SHA512 context\n"); + EVP_MD_CTX_free(pkg->mdctx_blake2); + EVP_MD_CTX_free(pkg->mdctx_sha256); + EVP_MD_CTX_free(pkg->mdctx_sha512); + free(pkg); + return NULL; + } + + return pkg; +} + +/** + * Free all memory related to the given struct including itself + * @param pkg the struct to destroy + */ +inline void destroy_md_components(struct df_md_components *pkg) { + if (pkg == NULL) { + return; + } + if (pkg->mdctx_blake2 != NULL) { + EVP_MD_CTX_free(pkg->mdctx_blake2); + } + if (pkg->mdctx_sha256 != NULL) { + EVP_MD_CTX_free(pkg->mdctx_sha256); + } + if (pkg->mdctx_sha512 != NULL) { + EVP_MD_CTX_free(pkg->mdctx_sha512); + } + free(pkg); +} + + +/** + * Read the file defined by path and name in the argument struct, and add stat + * results and binary represented hashes of the file into the struct. + * + * @param info struct contains the path of the file to read, results will be + * stored there. + * + * @return 0 on success + * -1 on failure + */ +int process_file(struct df_fileinfo *info) { + FILE *fdin = NULL; + char fullpath[4096]; + unsigned char buffer[glbl_bufsize]; + size_t bytes_read; + struct df_md_components *ctx_pkg; + bool error_in_loop = false; + + unsigned char md_val[EVP_MAX_MD_SIZE]; + unsigned int md_len; + + if (info == NULL || info->name == NULL || info->path == NULL || + info->name[0] == '\0' || info->path[0] == '\0') { + LOGERR("ERROR: Not enough information to construct a full path.\n"); + return -1; + } + + + if (snprintf(fullpath, 4096, "%s/%s", info->path, info->name) < 0) { + LOGERR("ERROR: Failed to print fullpath string into stack memory: %s (errno %d)\n", + strerror(errno), errno); + return -1; + } + + if (lstat(fullpath, &(info->statbuf)) == -1) { + LOGERR("ERROR: lstat() call failed for file %s: %s (errno %d)\n", + fullpath, strerror(errno), errno); + return -1; + } + + if ((info->statbuf.st_mode & S_IFMT) != S_IFREG) { + LOGERR("ERROR: Non-regular files are not processed.\n"); + return -1; + } + + if ((ctx_pkg = init_md_components()) == NULL) { + LOGERR("ERROR: Failed to initialize/create md contexts to be used with %s\n", + fullpath); + return -1; + } + + if ((fdin=fopen(fullpath, "rb")) == NULL) { + LOGERR("ERROR: Failed to open file '%s' %s\n", fullpath, strerror(errno)); + destroy_md_components(ctx_pkg); + return -1; + } + + /* TODO: proper fread related error handling */ + while (true) { + bytes_read = fread(buffer, sizeof(unsigned char), glbl_bufsize, fdin); + + if (EVP_DigestUpdate(ctx_pkg->mdctx_blake2, buffer, bytes_read) != 1) { + LOGERR("ERROR: Failed to update message digest BLAKE2 of file '%s'\n", fullpath); + error_in_loop = true; + break; + } + if (EVP_DigestUpdate(ctx_pkg->mdctx_sha256, buffer, bytes_read) != 1) { + LOGERR("ERROR: Failed to update message digest SHA256 of file '%s'\n", fullpath); + error_in_loop = true; + break; + } + if (EVP_DigestUpdate(ctx_pkg->mdctx_sha512, buffer, bytes_read) != 1) { + LOGERR("ERROR: Failed to update message digest SHA512 of file '%s'\n", fullpath); + error_in_loop = true; + break; + } + if (bytes_read != glbl_bufsize) { + if (feof(fdin) != 0) { + break; + } + if (ferror(fdin) != 0) { + LOGERR("ERROR: Failed to read from %s: %s (errno %d)\n", + fullpath, strerror(errno), errno); + error_in_loop = true; + break; + } + } + } + fclose(fdin); + fdin = NULL; + if (error_in_loop) { + destroy_md_components(ctx_pkg); + return -1; + } + + if (EVP_DigestFinal_ex(ctx_pkg->mdctx_blake2, md_val, &md_len) != 1) { + LOGERR("ERROR: Failed to finalize MD BLAKE2 of file '%s'\n", fullpath); + destroy_md_components(ctx_pkg); + return -1; + } + memcpy(info->blake2, md_val, md_len); + + if (EVP_DigestFinal_ex(ctx_pkg->mdctx_sha256, md_val, &md_len) != 1) { + LOGERR("ERROR: Failed to finalize MD SHA256 of file '%s'\n", fullpath); + destroy_md_components(ctx_pkg); + return -1; + } + memcpy(info->sha256, md_val, md_len); + + if (EVP_DigestFinal_ex(ctx_pkg->mdctx_sha512, md_val, &md_len) != 1) { + LOGERR("ERROR: Failed to finalize MD SHA512 of file '%s'\n", fullpath); + destroy_md_components(ctx_pkg); + return -1; + } + memcpy(info->sha512, md_val, md_len); + + return 0; +} + + |
