aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThorsten Töpper <atsutane@freethoughts.de>2026-02-07 21:43:17 +0100
committerThorsten Töpper <atsutane@freethoughts.de>2026-02-07 21:43:17 +0100
commiteed2d1323441861f2d41f0ecc0a72fcc9190fa5f (patch)
tree779cd7c1768504308e9957cfbc5cfc271e89f1c5
parentb7d09007d04c3b7c38848dd05d6105f3354b6b15 (diff)
downloadduplicate_finder-eed2d1323441861f2d41f0ecc0a72fcc9190fa5f.tar.gz
duplicate_finder-eed2d1323441861f2d41f0ecc0a72fcc9190fa5f.tar.bz2
file processor: Copied from my small-utils project
-rw-r--r--include/file_processor.h41
-rw-r--r--include/hex_conversion.h113
-rw-r--r--include/trace_macros.h23
-rw-r--r--src/file_processor.c292
4 files changed, 469 insertions, 0 deletions
diff --git a/include/file_processor.h b/include/file_processor.h
new file mode 100644
index 0000000..8cfb6de
--- /dev/null
+++ b/include/file_processor.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2026 Thorsten Töpper
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef FILE_PROCESSOR_H
+#define FILE_PROCESSOR_H
+
+#include <sys/stat.h>
+
+/* Digest lengths in bytes (256 bits = 32 bytes, 512 bits = 64 bytes); used
+ * as the array sizes of the hash fields in struct df_fileinfo. */
+#define DF_BYTE_SIZE_256 32
+#define DF_BYTE_SIZE_512 64
+
+/* Aliases for convenience, currently all algorithms are part of the default
+ * provider.
+ * NOTE(review): presumably OpenSSL algorithm names meant for EVP_MD_fetch();
+ * confirm against the callers, as no user is visible next to these defines. */
+#define DF_OSSL_BLAKE2 "BLAKE2B-512"
+#define DF_OSSL_SHA256 "SHA2-256"
+#define DF_OSSL_SHA512 "SHA2-512"
+
+/**
+ * Information about a single file.
+ * Holds the file's location (path and name), the lstat() result and the
+ * file's hash values for several algorithms in binary form.
+ * TODO: Organize the paths in a global pool (list/tree/map) and only refer to
+ * entries of that pool, so that no free() call is ever triggered through the
+ * pointer in this struct.
+ */
+struct df_fileinfo {
+ char *path; /**< Path of the directory containing the file; joined with name as "path/name". */
+ char *name; /**< Name of the file inside path. */
+ unsigned char blake2[DF_BYTE_SIZE_512]; /**< The BLAKE2b-512 hash in binary form. */
+ unsigned char sha256[DF_BYTE_SIZE_256]; /**< The SHA256 hash in binary form. */
+ unsigned char sha512[DF_BYTE_SIZE_512]; /**< The SHA512 hash in binary form. */
+ struct stat statbuf; /**< Result of lstat() call. Symlinks are to be ignored and filtered out earlier. */
+};
+
+
+/*=========== FUNCTIONS ===========*/
+/**
+ * Stat and hash the file "info->path/info->name".
+ * On success the lstat() result and the BLAKE2, SHA256 and SHA512 digests of
+ * the file content are stored in info. Only regular files are processed.
+ *
+ * @param info path and name must be non-NULL, non-empty strings.
+ * @return 0 on success, -1 on failure.
+ */
+int process_file(struct df_fileinfo *info);
+
+#endif
+
diff --git a/include/hex_conversion.h b/include/hex_conversion.h
new file mode 100644
index 0000000..90ab9e4
--- /dev/null
+++ b/include/hex_conversion.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2026 Thorsten Töpper
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef HEX_CONVERSION_H
+#define HEX_CONVERSION_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifdef DEBUGBUILD
+#include "trace_macros.h"
+#endif
+
+/* Evaluates to non-zero when c is an ASCII hexadecimal digit (0-9, A-F, a-f).
+ * The argument is parenthesized so that expression arguments (for example a
+ * conditional expression) keep their meaning. c is still evaluated several
+ * times, so it must not have side effects. */
+#define ishex_macro(c) \
+    ((((c) >= '0') && ((c) <= '9')) || \
+     (((c) >= 'A') && ((c) <= 'F')) || \
+     (((c) >= 'a') && ((c) <= 'f')))
+
+/* Declared static inline: a prototype without `inline` would turn every
+ * definition in this header into an external definition, and including the
+ * header from more than one translation unit would then produce duplicate
+ * symbols at link time. The internal linkage declared here carries over to
+ * the definitions of these functions. */
+static inline int convert_line(char *s);
+static inline int ishex(unsigned char c);
+static inline int ishex_string(const char *s, size_t l);
+static inline unsigned char *convert_to_binary(char *hex, unsigned char *out);
+static inline char *convert_from_binary(unsigned char *bin, size_t l, char *out);
+
+/* short inline functions are fine in header */
+/**
+ * Upper-case a line in place and cut off its line ending.
+ *
+ * Every character is converted with toupper(). The string is terminated at
+ * the first '\r' found within the last two characters, or at a '\n' that is
+ * the very last character.
+ *
+ * @param s NUL-terminated, writable string; modified in place.
+ * @return 0 on success, -1 if s is NULL.
+ */
+inline int convert_line(char *s) {
+    size_t i, l;
+
+    if (s == NULL) {
+        return -1;
+    }
+    l = strlen(s);
+    for (i = 0; i < l; i++) {
+        /* <ctype.h> functions require a value representable as unsigned char
+         * (or EOF); a plain char may be negative for bytes >= 0x80. */
+        s[i] = (char)toupper((unsigned char)s[i]);
+        if ((s[i] == '\r' && (l - i < 3)) || (s[i] == '\n' && i == l - 1)) {
+            s[i] = '\0';
+            break;
+        }
+    }
+    return 0;
+}
+
+/**
+ * Test whether c is an ASCII hexadecimal digit.
+ *
+ * @param c character to test.
+ * @return 1 if c is one of 0-9, A-F or a-f, otherwise 0.
+ */
+inline int ishex(unsigned char c) {
+    return ((c >= '0' && c <= '9') ||
+            (c >= 'A' && c <= 'F') ||
+            (c >= 'a' && c <= 'f')) ? 1 : 0;
+}
+
+/**
+ * Test whether a character sequence consists only of hexadecimal digits.
+ *
+ * @param s string to inspect.
+ * @param l number of characters to inspect; 0 means "use strlen(s)".
+ * @return 1 if every inspected character is a hex digit (also when there is
+ *         nothing to inspect), 0 if s is NULL or a non-hex character is found.
+ */
+inline int ishex_string(const char *s, size_t l) {
+    const char *end;
+
+    if (s == NULL) {
+        return 0;
+    }
+    end = s + (l != 0 ? l : strlen(s));
+    while (s != end) {
+        if (!ishex_macro(*s)) {
+            return 0;
+        }
+        s++;
+    }
+    return 1;
+}
+
+/**
+ * Decode a hexadecimal string into raw bytes.
+ *
+ * @param hex NUL-terminated string with an even, non-zero number of hex digits.
+ * @param out destination for strlen(hex)/2 bytes. When NULL, a buffer of that
+ *            size is allocated with calloc() and must be freed by the caller.
+ * @return out (or the newly allocated buffer) on success; NULL when hex is
+ *         NULL, empty, of odd length, contains a non-hex character, or the
+ *         allocation fails. Nothing is written to out on failure.
+ */
+inline unsigned char *convert_to_binary(char *hex, unsigned char *out) {
+    char tmp[3] = {0, 0, 0};
+    size_t length, i;
+
+    if (hex == NULL) {
+        return NULL;
+    }
+    length = strlen(hex);
+    if (length == 0 || length % 2 == 1) {
+        return NULL;
+    }
+    /* Validate the whole string first so that a bad digit never leaves a
+     * partially written output buffer behind. */
+    for (i = 0; i < length; i++) {
+        if (!ishex_macro(hex[i])) {
+#ifdef DEBUGBUILD
+            LOGERR("Incompatible string '%s'\n", hex);
+#endif
+            return NULL;
+        }
+    }
+    if (out == NULL && (out = calloc(length / 2, sizeof *out)) == NULL) {
+#ifdef DEBUGBUILD
+        /* %zu is the conversion for size_t; %lu is only correct where size_t
+         * happens to be unsigned long. */
+        LOGERR("ERROR: Failed to allocate %zu bytes\n", length / 2);
+#endif
+        return NULL;
+    }
+    /* Two hex digits form one output byte; tmp[2] stays '\0'. */
+    for (i = 0; i < length; i += 2) {
+        tmp[0] = hex[i];
+        tmp[1] = hex[i + 1];
+        out[i / 2] = (unsigned char)strtol(tmp, NULL, 16);
+    }
+    return out;
+}
+
+/**
+ * Encode raw bytes as upper-case hexadecimal text.
+ *
+ * Each byte is translated through a 16-entry digit table, one lookup per
+ * nibble. This always yields exactly two characters per byte and does not
+ * depend on <stdio.h>, which this header only pulls in for DEBUGBUILD.
+ *
+ * @param bin input bytes.
+ * @param l   number of bytes in bin.
+ * @param out destination for 2*l characters. When NULL, a zero-filled buffer
+ *            of 2*l+1 bytes is allocated with calloc() (so the result is
+ *            NUL-terminated) and must be freed by the caller. A buffer
+ *            supplied by the caller is NOT NUL-terminated by this function.
+ * @return out (or the newly allocated buffer) on success; NULL if bin is
+ *         NULL, l is 0 or the allocation fails.
+ */
+inline char *convert_from_binary(unsigned char *bin, size_t l, char *out) {
+    const char *const digits = "0123456789ABCDEF";
+    size_t i;
+
+    if (bin == NULL || l == 0) {
+        return NULL;
+    }
+    if (out == NULL && (out = calloc((l * 2) + 1, sizeof *out)) == NULL) {
+#ifdef DEBUGBUILD
+        LOGERR("ERROR: Failed to allocate %zu bytes\n", (l * 2) + 1);
+#endif
+        return NULL;
+    }
+    for (i = 0; i < l; i++) {
+        out[i * 2] = digits[(bin[i] >> 4) & 0x0F];
+        out[(i * 2) + 1] = digits[bin[i] & 0x0F];
+    }
+    return out;
+}
+
+#endif
+
diff --git a/include/trace_macros.h b/include/trace_macros.h
new file mode 100644
index 0000000..b27dc40
--- /dev/null
+++ b/include/trace_macros.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2026 Thorsten Töpper
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef TRACE_MACROS_H
+#define TRACE_MACROS_H
+
+#include <stdio.h>
+
+#ifndef LOGERR
+/* Print "[file:line] function: " followed by the printf-style message to
+ * stderr. Wrapped in do { } while (0) so that the expansion plus the
+ * caller's semicolon forms exactly one statement; a bare { } block would
+ * break `if (x) LOGERR(...); else ...`. */
+#define LOGERR(...) do { fprintf(stderr, "[%s:%d] %s: ", __FILE__, __LINE__, __func__); fprintf(stderr, __VA_ARGS__); } while (0)
+#endif
+
+/* Debug trace: forwards to LOGERR when DEBUGBUILD is defined and expands to
+ * nothing otherwise, so its arguments are not evaluated in non-debug builds. */
+#ifdef DEBUGBUILD
+#define DBGTRC(...) LOGERR(__VA_ARGS__)
+#else
+#define DBGTRC(...)
+#endif
+
+#endif
+
diff --git a/src/file_processor.c b/src/file_processor.c
new file mode 100644
index 0000000..1cfed46
--- /dev/null
+++ b/src/file_processor.c
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Copyright 2026 Thorsten Töpper
+ *
+ * @file file_processor.c
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+
+
+/* https://docs.openssl.org/master/man3/EVP_DigestInit/ */
+#include <openssl/evp.h>
+
+#include "file_processor.h"
+#include "trace_macros.h"
+
+
+/*=========== DEFINES, CONSTANTS AND TYPES ===========*/
+
+#ifndef BUFSIZE4MIB
+/* Default read buffer size: 4 MiB (4 * 1024 * 1024 bytes).
+ * Personal observation: dd bs=4M usually gets good performance regardless of SSD/HDD or USB 2.x/3.x */
+#define BUFSIZE4MIB 4194304
+#endif
+
+
+/**
+ * Objects from the OpenSSL library: one message digest context per hash
+ * algorithm.
+ * Keeping them together in one struct, managed outside the file handling
+ * code, improves the readability of that code.
+ */
+struct df_md_components {
+ EVP_MD_CTX *mdctx_blake2; /**< digest context for BLAKE2 */
+ EVP_MD_CTX *mdctx_sha256; /**< digest context for SHA256 */
+ EVP_MD_CTX *mdctx_sha512; /**< digest context for SHA512 */
+};
+
+
+/*=========== GLOBAL VARIABLES ===========*/
+
+/** Size in bytes of the buffer process_file() reads file content into.
+ * TODO: Should be set via a command line parameter; move this to the
+ * parameter handling once that is implemented.
+ */
+size_t glbl_bufsize = BUFSIZE4MIB;
+
+/* EVP_MD objects shared by all contexts; NULL until init_md_components()
+ * looks them up on first use, then reused for every later call. */
+const EVP_MD *glbl_md_blake2 = NULL;
+const EVP_MD *glbl_md_sha256 = NULL;
+const EVP_MD *glbl_md_sha512 = NULL;
+
+
+
+/*=========== FUNCTIONS ===========*/
+/* File-private helpers: `static` keeps them out of the global namespace and
+ * `(void)` makes the declaration a real prototype, so that a call with
+ * arguments is rejected by the compiler. */
+static struct df_md_components *init_md_components(void);
+static void destroy_md_components(struct df_md_components *pkg);
+
+
+/**
+ * Prepare a df_md_components struct for active usage.
+ *
+ * Allocates the struct, looks up the three EVP_MD objects on first use (they
+ * are cached in the glbl_md_* variables), creates one EVP_MD_CTX per
+ * algorithm and initializes each context with its EVP_MD.
+ *
+ * @return the ready-to-use struct, to be released with
+ *         destroy_md_components(), or NULL on failure. On failure everything
+ *         allocated so far has been released again.
+ */
+inline struct df_md_components *init_md_components(void) {
+    struct df_md_components *pkg = calloc(1, sizeof *pkg);
+
+    if (pkg == NULL) {
+        LOGERR("ERROR: Failed to allocate heap memory for a struct df_md_components errno %d: %s\n",
+                errno, strerror(errno));
+        return NULL;
+    }
+
+    /* TODO: research whether performance difference is really worth this additional complexity,
+     * the util is processing files from storage as single thread, not tons of <1KiB blobs in parallel.
+     * So a few nano- or microseconds are mostly insignificant compared to the I/O throttle. */
+    if (glbl_md_blake2 == NULL && (glbl_md_blake2 = EVP_blake2b512()) == NULL) {
+        LOGERR("ERROR: Failed to fetch EVP_MD for BLAKE2\n");
+        goto fail;
+    }
+    if (glbl_md_sha256 == NULL && (glbl_md_sha256 = EVP_sha256()) == NULL) {
+        LOGERR("ERROR: Failed to fetch EVP_MD for SHA2_256\n");
+        goto fail;
+    }
+    if (glbl_md_sha512 == NULL && (glbl_md_sha512 = EVP_sha512()) == NULL) {
+        LOGERR("ERROR: Failed to fetch EVP_MD for SHA2_512\n");
+        goto fail;
+    }
+
+    /* Create the contexts. pkg was zero-filled by calloc(), so contexts not
+     * created yet are NULL when the failure path runs. */
+    if ((pkg->mdctx_blake2 = EVP_MD_CTX_new()) == NULL) {
+        LOGERR("ERROR: Failed to create context for BLAKE2\n");
+        goto fail;
+    }
+    if ((pkg->mdctx_sha256 = EVP_MD_CTX_new()) == NULL) {
+        LOGERR("ERROR: Failed to create context for SHA256\n");
+        goto fail;
+    }
+    if ((pkg->mdctx_sha512 = EVP_MD_CTX_new()) == NULL) {
+        LOGERR("ERROR: Failed to create context for SHA512\n");
+        goto fail;
+    }
+
+    /* Initialize them */
+    if (EVP_DigestInit_ex2(pkg->mdctx_blake2, glbl_md_blake2, NULL) != 1) {
+        LOGERR("ERROR: Failed to initialize BLAKE2 context\n");
+        goto fail;
+    }
+    if (EVP_DigestInit_ex2(pkg->mdctx_sha256, glbl_md_sha256, NULL) != 1) {
+        LOGERR("ERROR: Failed to initialize SHA256 context\n");
+        goto fail;
+    }
+    if (EVP_DigestInit_ex2(pkg->mdctx_sha512, glbl_md_sha512, NULL) != 1) {
+        LOGERR("ERROR: Failed to initialize SHA512 context\n");
+        goto fail;
+    }
+
+    return pkg;
+
+fail:
+    /* Single exit for all failures: frees the contexts created so far and
+     * pkg itself, so no path can leak the struct. */
+    destroy_md_components(pkg);
+    return NULL;
+}
+
+/**
+ * Release a df_md_components struct: every digest context that was created
+ * is freed, then the struct itself. Passing NULL is allowed and does nothing.
+ * @param pkg the struct to destroy
+ */
+inline void destroy_md_components(struct df_md_components *pkg) {
+    EVP_MD_CTX *contexts[3];
+    size_t idx;
+
+    if (pkg != NULL) {
+        contexts[0] = pkg->mdctx_blake2;
+        contexts[1] = pkg->mdctx_sha256;
+        contexts[2] = pkg->mdctx_sha512;
+        for (idx = 0; idx < sizeof contexts / sizeof contexts[0]; idx++) {
+            if (contexts[idx] != NULL) {
+                EVP_MD_CTX_free(contexts[idx]);
+            }
+        }
+        free(pkg);
+    }
+}
+
+
+/**
+ * Read the file defined by path and name in the argument struct, and add stat
+ * results and binary represented hashes of the file into the struct.
+ *
+ * The file is read in chunks of glbl_bufsize bytes into a heap buffer and
+ * each chunk is fed to the BLAKE2, SHA256 and SHA512 contexts. All resources
+ * (buffer, stream, digest contexts) are released on every return path.
+ *
+ * @param info struct contains the path of the file to read, results will be
+ *             stored there. path and name must be non-NULL and non-empty.
+ *             On failure the content of the result fields is unspecified.
+ *
+ * @return 0 on success
+ *         -1 on failure
+ */
+int process_file(struct df_fileinfo *info) {
+    FILE *fdin = NULL;
+    char fullpath[4096];
+    unsigned char *buffer = NULL;
+    /* Snapshot of the global, so allocation and read loop agree on the size. */
+    const size_t bufsize = glbl_bufsize;
+    size_t bytes_read;
+    struct df_md_components *ctx_pkg = NULL;
+    unsigned char md_val[EVP_MAX_MD_SIZE];
+    unsigned int md_len;
+    int path_len;
+    int rc = -1;
+
+    if (info == NULL || info->name == NULL || info->path == NULL ||
+        info->name[0] == '\0' || info->path[0] == '\0') {
+        LOGERR("ERROR: Not enough information to construct a full path.\n");
+        return -1;
+    }
+
+    path_len = snprintf(fullpath, sizeof fullpath, "%s/%s", info->path, info->name);
+    if (path_len < 0) {
+        LOGERR("ERROR: Failed to print fullpath string into stack memory: %s (errno %d)\n",
+                strerror(errno), errno);
+        return -1;
+    }
+    /* snprintf() returns the length it wanted to write; anything that does
+     * not fit was truncated and would name a different file. */
+    if ((size_t)path_len >= sizeof fullpath) {
+        LOGERR("ERROR: Full path '%s/%s' does not fit into %zu bytes\n",
+                info->path, info->name, sizeof fullpath);
+        return -1;
+    }
+
+    if (lstat(fullpath, &(info->statbuf)) == -1) {
+        LOGERR("ERROR: lstat() call failed for file %s: %s (errno %d)\n",
+                fullpath, strerror(errno), errno);
+        return -1;
+    }
+
+    if ((info->statbuf.st_mode & S_IFMT) != S_IFREG) {
+        LOGERR("ERROR: Non-regular files are not processed.\n");
+        return -1;
+    }
+
+    /* With a zero sized buffer fread() always returns 0 == bufsize and the
+     * read loop below would never terminate. */
+    if (bufsize == 0) {
+        LOGERR("ERROR: Read buffer size must not be 0\n");
+        return -1;
+    }
+
+    /* Heap instead of a variable length array: the default size of 4 MiB is
+     * too large to be placed on the stack safely. */
+    if ((buffer = malloc(bufsize)) == NULL) {
+        LOGERR("ERROR: Failed to allocate %zu bytes as read buffer: %s (errno %d)\n",
+                bufsize, strerror(errno), errno);
+        return -1;
+    }
+
+    if ((ctx_pkg = init_md_components()) == NULL) {
+        LOGERR("ERROR: Failed to initialize/create md contexts to be used with %s\n",
+                fullpath);
+        goto cleanup;
+    }
+
+    if ((fdin = fopen(fullpath, "rb")) == NULL) {
+        LOGERR("ERROR: Failed to open file '%s' %s\n", fullpath, strerror(errno));
+        goto cleanup;
+    }
+
+    for (;;) {
+        bytes_read = fread(buffer, 1, bufsize, fdin);
+
+        if (EVP_DigestUpdate(ctx_pkg->mdctx_blake2, buffer, bytes_read) != 1) {
+            LOGERR("ERROR: Failed to update message digest BLAKE2 of file '%s'\n", fullpath);
+            goto cleanup;
+        }
+        if (EVP_DigestUpdate(ctx_pkg->mdctx_sha256, buffer, bytes_read) != 1) {
+            LOGERR("ERROR: Failed to update message digest SHA256 of file '%s'\n", fullpath);
+            goto cleanup;
+        }
+        if (EVP_DigestUpdate(ctx_pkg->mdctx_sha512, buffer, bytes_read) != 1) {
+            LOGERR("ERROR: Failed to update message digest SHA512 of file '%s'\n", fullpath);
+            goto cleanup;
+        }
+        /* A short read means end of file or a read error; otherwise go on. */
+        if (bytes_read != bufsize) {
+            if (feof(fdin) != 0) {
+                break;
+            }
+            if (ferror(fdin) != 0) {
+                LOGERR("ERROR: Failed to read from %s: %s (errno %d)\n",
+                        fullpath, strerror(errno), errno);
+                goto cleanup;
+            }
+        }
+    }
+    fclose(fdin);
+    fdin = NULL;
+
+    if (EVP_DigestFinal_ex(ctx_pkg->mdctx_blake2, md_val, &md_len) != 1) {
+        LOGERR("ERROR: Failed to finalize MD BLAKE2 of file '%s'\n", fullpath);
+        goto cleanup;
+    }
+    memcpy(info->blake2, md_val, md_len);
+
+    if (EVP_DigestFinal_ex(ctx_pkg->mdctx_sha256, md_val, &md_len) != 1) {
+        LOGERR("ERROR: Failed to finalize MD SHA256 of file '%s'\n", fullpath);
+        goto cleanup;
+    }
+    memcpy(info->sha256, md_val, md_len);
+
+    if (EVP_DigestFinal_ex(ctx_pkg->mdctx_sha512, md_val, &md_len) != 1) {
+        LOGERR("ERROR: Failed to finalize MD SHA512 of file '%s'\n", fullpath);
+        goto cleanup;
+    }
+    memcpy(info->sha512, md_val, md_len);
+
+    rc = 0;
+
+cleanup:
+    /* Shared by success and failure, so the digest contexts are released
+     * after a successful run as well. */
+    if (fdin != NULL) {
+        fclose(fdin);
+    }
+    destroy_md_components(ctx_pkg);
+    free(buffer);
+    return rc;
+}
+
+