From a3b5ddabd37cce398e84f0e98fa9c3318ee8df86 Mon Sep 17 00:00:00 2001 From: Thorsten Töpper Date: Tue, 24 Feb 2026 23:01:40 +0100 Subject: Identify duplicates by identical hash sets. --- src/database_interaction.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ src/duplicate_finder.c | 14 ++- 2 files changed, 243 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/database_interaction.c b/src/database_interaction.c index 37ce32c..e9e0b86 100644 --- a/src/database_interaction.c +++ b/src/database_interaction.c @@ -53,10 +53,13 @@ sqlite3 *dbconn = NULL; /* The statements will be wrapped via a function dbi_STATEMENTNAME() for the outside */ sqlite3_stmt *select_filename_by_id, *select_filename_by_name, + *select_filename_complete_table, *select_path_by_id, *select_path_by_pathname, + *select_hashes_all_ids, *select_hashes_by_id, *select_hashes_by_strings, + *select_hashes_complete_table, *select_fileinfo_by_id, *select_fileinfo_by_id_resolved, *select_fileinfo_by_path_id, @@ -64,6 +67,8 @@ sqlite3_stmt *select_filename_by_id, *select_fileinfo_by_path_filename_ids, *select_fileinfo_by_hash_path_filename_ids, *select_fileinfo_by_hash_id, + *select_fileinfo_by_hash_id_key_only, + *select_fileinfo_by_hash_id_resolved, *select_fileinfo_complete_table, *select_fileinfo_complete_table_resolved; @@ -77,10 +82,16 @@ sqlite3_stmt *update_fileinfo_last_seen, sqlite3_stmt *delete_fileinfo_by_id; +sqlite3_stmt *count_fileinfo_by_hash_id, + *count_fileinfo_by_filename, + *count_hashes, + *count_fileinfo; + /*=========== FUNCTIONS ===========*/ void create_tables(); int prepare_statements(); char *select_string_by_int(sqlite3_stmt *st, int64_t id); +int64_t call_count_query(sqlite3_stmt *st); /* Writing this block way too often */ #define DBCONN_CHECK(x) \ @@ -124,16 +135,21 @@ void dbi_close() { LOCAL_FINALIZE(select_filename_by_id); LOCAL_FINALIZE(select_filename_by_name); + LOCAL_FINALIZE(select_filename_complete_table); LOCAL_FINALIZE(select_path_by_id); LOCAL_FINALIZE(select_path_by_pathname); + LOCAL_FINALIZE(select_hashes_all_ids); LOCAL_FINALIZE(select_hashes_by_id); LOCAL_FINALIZE(select_hashes_by_strings); + LOCAL_FINALIZE(select_hashes_complete_table); LOCAL_FINALIZE(select_fileinfo_by_id); LOCAL_FINALIZE(select_fileinfo_by_id_resolved); LOCAL_FINALIZE(select_fileinfo_by_path_id); LOCAL_FINALIZE(select_fileinfo_by_filename_id); LOCAL_FINALIZE(select_fileinfo_by_path_filename_ids); LOCAL_FINALIZE(select_fileinfo_by_hash_id); + LOCAL_FINALIZE(select_fileinfo_by_hash_id_resolved); + LOCAL_FINALIZE(select_fileinfo_by_hash_id_key_only); LOCAL_FINALIZE(select_fileinfo_complete_table); LOCAL_FINALIZE(select_fileinfo_complete_table_resolved); @@ -147,6 +163,10 @@ void dbi_close() { LOCAL_FINALIZE(delete_fileinfo_by_id); + LOCAL_FINALIZE(count_fileinfo); + LOCAL_FINALIZE(count_fileinfo_by_hash_id); + LOCAL_FINALIZE(count_fileinfo_by_filename); + LOCAL_FINALIZE(count_hashes); #undef LOCAL_FINALIZE sqlite3_close(dbconn); @@ -208,6 +228,7 @@ int prepare_statements() { LOCAL_PREP_STMT("SELECT pathname FROM paths WHERE id = ? ;", &select_path_by_id); LOCAL_PREP_STMT("SELECT id FROM paths WHERE pathname = ? ;", &select_path_by_pathname); + LOCAL_PREP_STMT("SELECT hashes.id FROM hashes ;", &select_hashes_all_ids); LOCAL_PREP_STMT("SELECT blake2, sha256, sha512 FROM hashes WHERE id = ? ;", &select_hashes_by_id); LOCAL_PREP_STMT("SELECT id FROM hashes WHERE blake2 = ? AND sha256 = ? AND sha512 = ? ;", &select_hashes_by_strings); @@ -218,14 +239,19 @@ int prepare_statements() { LOCAL_PREP_STMT("SELECT * FROM fileinfo WHERE p_id = ? AND fn_id = ? ;", &select_fileinfo_by_path_filename_ids); LOCAL_PREP_STMT("SELECT * FROM fileinfo WHERE h_id = ? AND p_id = ? AND fn_id = ? ;", &select_fileinfo_by_hash_path_filename_ids); LOCAL_PREP_STMT("SELECT * FROM fileinfo WHERE h_id = ? ;", &select_fileinfo_by_hash_id); + LOCAL_PREP_STMT("SELECT id FROM fileinfo WHERE h_id = ? ;", &select_fileinfo_by_hash_id_key_only); /* TODO: so far the only query with JOINs or masking it in another way? * Many years since */ LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id WHERE fileinfo.id = ? ;", &select_fileinfo_by_id_resolved); LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id ;", &select_fileinfo_complete_table_resolved); + LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id WHERE fileinfo.h_id = ?;", &select_fileinfo_by_hash_id_resolved); LOCAL_PREP_STMT("SELECT p_id, fn_id, h_id, size, last_seen, stat_struct FROM fileinfo ;", &select_fileinfo_complete_table); + LOCAL_PREP_STMT("SELECT * FROM filenames;", &select_filename_complete_table); + LOCAL_PREP_STMT("SELECT * FROM hashes;", &select_hashes_complete_table); + /* INSERT */ LOCAL_PREP_STMT("INSERT INTO filenames (name) VALUES (?);", &insert_filename); LOCAL_PREP_STMT("INSERT INTO paths (pathname) VALUES (?);", &insert_pathname); @@ -241,6 +267,12 @@ int prepare_statements() { /* DELETE */ LOCAL_PREP_STMT("DELETE FROM fileinfo WHERE id = ? ;", &delete_fileinfo_by_id); + /* COUNT */ + LOCAL_PREP_STMT("SELECT COUNT(fileinfo.id) FROM fileinfo;", &count_fileinfo); + LOCAL_PREP_STMT("SELECT COUNT(fileinfo.h_id) FROM fileinfo WHERE fileinfo.h_id = ?;", &count_fileinfo_by_hash_id); + LOCAL_PREP_STMT("SELECT COUNT(fileinfo.fn_id) FROM fileinfo WHERE fileinfo.fn_id = ?;", &count_fileinfo_by_filename); + LOCAL_PREP_STMT("SELECT COUNT(hashes.id) FROM hashes ;", &count_hashes); + #undef LOCAL_PREP_STMT return 0; } @@ -935,6 +967,204 @@ int dbi_print_fileinfo_resolved(FILE *out) { return rc; } + +inline int64_t call_count_query(sqlite3_stmt *st) { + int64_t count; + int strc = 0; + + DBCONN_CHECK(-1); + + sqlite3_reset(st); + strc = sqlite3_step(st); + + if (strc != SQLITE_ROW && strc != SQLITE_DONE) { + LOGERR("ERROR: Failed step: %s\n", sqlite3_errmsg(dbconn)); + return -1; + } + + count = (int64_t) sqlite3_column_int64(st, 0); + + sqlite3_reset(st); + + return count; +} + +/** + * Get an array containing all ids from table hashes with the first field + * containing the complete length of the array including this field. + * + * @return NULL on failure + * an array on the heap which must be freed by the caller. + */ +int64_t *dbi_select_hashes_all_ids() { + int64_t *result = NULL; + int64_t rows = 0, id = 1, pos = 1; + int strc = 0; + sqlite3_stmt *st = select_hashes_all_ids; + + DBCONN_CHECK(NULL); + + rows = call_count_query(count_hashes); + + if (rows < 0) { return NULL; } + + rows++; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" + if ((result = calloc(rows, sizeof(int64_t))) == NULL) { +#pragma GCC diagnostic pop + LOGERR("ERROR: Failed to allocate heap memory\n"); + return NULL; + } + + result[0] = rows; + + sqlite3_reset(st); + + do { + strc = sqlite3_step(st); + + if (strc == SQLITE_DONE) { + break; + } + + if (strc != SQLITE_ROW) { + LOGERR("ERROR: Failed step: %s\n", sqlite3_errmsg(dbconn)); + free(result); + result = NULL; + break; + } + + id = (int64_t) sqlite3_column_int64(st, 0); /* hashes.id */ + result[pos++] = id; + DBGTRC("DEBUG: Found id %ld\n", id); + + } while (strc == SQLITE_ROW); + + sqlite3_reset(st); + + DBGTRC("DEBUG: rows %ld | pos %ld\n", rows, pos); + + return result; +} + + +/** + * Iterate over the stored hashes, for those associated with more than + * one row in fileinfo the information from the latter will be printed. + * + * @param out filestream for output, if NULL stdout is used + * + * @return 0 on success + * <0 on failure + */ +int dbi_print_identical_hashes(FILE *out) { + int rc = 0; + int64_t hid, count, id, i; + int64_t *hid_array; + int strc_fi = 0, strc_count = 0; + FILE *fd = out; + const unsigned char *txt = NULL; + sqlite3_stmt *stfi = select_fileinfo_by_hash_id_resolved, + *stcount = count_fileinfo_by_hash_id; + + DBCONN_CHECK(-1); + + if (fd == NULL) { fd = stdout; } + + /* SQLite only supports one query at a time per connection, therefore the + * segmented approach. Query for all hashes. Iterating over the array and + * query to count the usage of each id, query in case there's more than a + * single association. A second connection is possible, but would require + * another set of query preparation and other surround stuff. + */ + hid_array = dbi_select_hashes_all_ids(); + if (hid_array == NULL) { + return -1; + } + + for (i=1; i1) { + sqlite3_reset(stfi); + sqlite3_clear_bindings(stfi); + + if (sqlite3_bind_int64(stfi, 1, hid) != SQLITE_OK) { + LOGERR("ERROR: Failed to bind hashes.id to prepared statement for count: %s\n", sqlite3_errmsg(dbconn)); + free(hid_array); + return -1; + } + + do { + strc_fi = sqlite3_step(stfi); + + if (strc_fi == SQLITE_DONE) { + DBGTRC("DEBUG: finished for hid %ld\n", hid); + break; + } + + if (strc_fi != SQLITE_ROW) { + LOGERR("ERROR: Failed step to get fileinfo content: %s\n", sqlite3_errmsg(dbconn)); + free(hid_array); + return -1; /* drop-it */ + } + + txt = sqlite3_column_text(stfi, 0); /* paths.pathname */ + fprintf(fd, "%s/", txt); + txt = sqlite3_column_text(stfi, 1); /* filenames.name */ + fprintf(fd, "%s;", txt); + txt = sqlite3_column_text(stfi, 2); /* hashes.blake2 */ + fprintf(fd, "%s;", txt); + txt = sqlite3_column_text(stfi, 3); /* hashes.sha256 */ + fprintf(fd, "%s;", txt); + txt = sqlite3_column_text(stfi, 4); /* hashes.sha512 */ + fprintf(fd, "%s;", txt); + id = (int64_t) sqlite3_column_int64(stfi, 5); /* fileinfo.size */ + fprintf(fd, "%ld\n", id); + } while (strc_fi == SQLITE_ROW); + } + sqlite3_reset(stfi); + sqlite3_clear_bindings(stfi); + } + + free(hid_array); + + sqlite3_clear_bindings(stfi); + sqlite3_clear_bindings(stcount); + sqlite3_reset(stfi); + sqlite3_reset(stcount); + + return rc; +} + + +int dbi_print_identical_filenames(FILE *out); + + + #if 0 *select_fileinfo_by_id, *select_fileinfo_by_id_resolved, diff --git a/src/duplicate_finder.c b/src/duplicate_finder.c index dae7832..8a058fd 100644 --- a/src/duplicate_finder.c +++ b/src/duplicate_finder.c @@ -47,6 +47,14 @@ int analyze_db_content() { /* TODO: Implementation of several SQL queries... not in the mood */ dbi_print_fileinfo_resolved(stdout); + printf("\n\n---- IDENTICAL HASHES ----\n\n"); + + if (dbi_print_identical_hashes(stdout) < 0) { + LOGERR("ERROR: Identification of duplicates via hashes failed.\n"); + dbi_close(); + return EXIT_FAILURE; + } + dbi_close(); return EXIT_SUCCESS; } @@ -96,8 +104,12 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } + DBGTRC("Database: %s\n", option_sqlite_db_name); + DBGTRC("KVStorage: %s\n", option_gdbm_db_name); + + if (option_mode == MODE_SCAN) { - return scan((path_index == argc) ? argv[path_index] : "."); + return scan((path_index == argc) ? "." : argv[path_index] ); } if (option_mode == MODE_ANALYZE_DB) { -- cgit v1.3