diff options
| -rw-r--r-- | include/database_interaction.h | 2 | ||||
| -rw-r--r-- | src/database_interaction.c | 132 | ||||
| -rw-r--r-- | src/duplicate_finder.c | 9 |
3 files changed, 136 insertions, 7 deletions
diff --git a/include/database_interaction.h b/include/database_interaction.h index 32d76dd..0ceb973 100644 --- a/include/database_interaction.h +++ b/include/database_interaction.h @@ -15,6 +15,7 @@ void dbi_close(); char *dbi_select_filename_by_id(int64_t id); int64_t dbi_select_filename_by_name(const char *name); +int64_t *dbi_select_filenames_all_ids(); int dbi_insert_filename(const char *filename); char *dbi_select_path_by_id(int64_t id); @@ -22,6 +23,7 @@ int64_t dbi_select_path_by_pathname(const char *pathname); int dbi_insert_pathname(const char *path); struct df_hashstrings *dbi_select_hashes_by_id(int64_t id); +int64_t *dbi_select_hashes_all_ids(); int64_t dbi_select_hashes_by_strings(const char *blake2, const char *sha256, const char *sha512); int dbi_insert_hashes(const char *blake2, const char *sha256, const char *sha512); diff --git a/src/database_interaction.c b/src/database_interaction.c index e9e0b86..3acc81c 100644 --- a/src/database_interaction.c +++ b/src/database_interaction.c @@ -53,6 +53,7 @@ sqlite3 *dbconn = NULL; /* The statements will be wrapped via a function dbi_STATEMENTNAME() for the outside */ sqlite3_stmt *select_filename_by_id, *select_filename_by_name, + *select_filename_all_ids, *select_filename_complete_table, *select_path_by_id, *select_path_by_pathname, @@ -64,6 +65,7 @@ sqlite3_stmt *select_filename_by_id, *select_fileinfo_by_id_resolved, *select_fileinfo_by_path_id, *select_fileinfo_by_filename_id, + *select_fileinfo_by_filename_id_resolved, *select_fileinfo_by_path_filename_ids, *select_fileinfo_by_hash_path_filename_ids, *select_fileinfo_by_hash_id, @@ -84,6 +86,7 @@ sqlite3_stmt *delete_fileinfo_by_id; sqlite3_stmt *count_fileinfo_by_hash_id, *count_fileinfo_by_filename, + *count_filenames, *count_hashes, *count_fileinfo; @@ -92,6 +95,7 @@ void create_tables(); int prepare_statements(); char *select_string_by_int(sqlite3_stmt *st, int64_t id); int64_t call_count_query(sqlite3_stmt *st); +int64_t 
*call_select_all_ids(sqlite3_stmt *all_ids, sqlite3_stmt *count_query); /* Writing this block way too often */ #define DBCONN_CHECK(x) \ @@ -133,6 +137,7 @@ void dbi_close() { DBCONN_CHECK(); + LOCAL_FINALIZE(select_filename_all_ids); LOCAL_FINALIZE(select_filename_by_id); LOCAL_FINALIZE(select_filename_by_name); LOCAL_FINALIZE(select_filename_complete_table); @@ -146,6 +151,7 @@ void dbi_close() { LOCAL_FINALIZE(select_fileinfo_by_id_resolved); LOCAL_FINALIZE(select_fileinfo_by_path_id); LOCAL_FINALIZE(select_fileinfo_by_filename_id); + LOCAL_FINALIZE(select_fileinfo_by_filename_id_resolved); LOCAL_FINALIZE(select_fileinfo_by_path_filename_ids); LOCAL_FINALIZE(select_fileinfo_by_hash_id); LOCAL_FINALIZE(select_fileinfo_by_hash_id_resolved); @@ -166,6 +172,7 @@ void dbi_close() { LOCAL_FINALIZE(count_fileinfo); LOCAL_FINALIZE(count_fileinfo_by_hash_id); LOCAL_FINALIZE(count_fileinfo_by_filename); + LOCAL_FINALIZE(count_filenames); LOCAL_FINALIZE(count_hashes); #undef LOCAL_FINALIZE @@ -224,6 +231,8 @@ int prepare_statements() { /* SELECT */ LOCAL_PREP_STMT("SELECT name FROM filenames WHERE id = ? ;", &select_filename_by_id); LOCAL_PREP_STMT("SELECT id FROM filenames WHERE name = ? ;", &select_filename_by_name); + LOCAL_PREP_STMT("SELECT id FROM filenames ;", &select_filename_all_ids); + LOCAL_PREP_STMT("SELECT * FROM filenames;", &select_filename_complete_table); LOCAL_PREP_STMT("SELECT pathname FROM paths WHERE id = ? ;", &select_path_by_id); LOCAL_PREP_STMT("SELECT id FROM paths WHERE pathname = ? ;", &select_path_by_pathname); @@ -231,6 +240,7 @@ int prepare_statements() { LOCAL_PREP_STMT("SELECT hashes.id FROM hashes ;", &select_hashes_all_ids); LOCAL_PREP_STMT("SELECT blake2, sha256, sha512 FROM hashes WHERE id = ? ;", &select_hashes_by_id); LOCAL_PREP_STMT("SELECT id FROM hashes WHERE blake2 = ? AND sha256 = ? AND sha512 = ? 
;", &select_hashes_by_strings); + LOCAL_PREP_STMT("SELECT * FROM hashes;", &select_hashes_complete_table); LOCAL_PREP_STMT("SELECT * FROM fileinfo WHERE id = ? ;", &select_fileinfo_by_id); @@ -247,10 +257,9 @@ int prepare_statements() { LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id WHERE fileinfo.id = ? ;", &select_fileinfo_by_id_resolved); LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id ;", &select_fileinfo_complete_table_resolved); LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id WHERE fileinfo.h_id = ?;", &select_fileinfo_by_hash_id_resolved); + LOCAL_PREP_STMT("SELECT paths.pathname, filenames.name, hashes.blake2, hashes.sha256, hashes.sha512, fileinfo.size, fileinfo.last_seen, fileinfo.stat_struct FROM fileinfo INNER JOIN paths ON fileinfo.p_id = paths.id INNER JOIN filenames ON fileinfo.fn_id = filenames.id INNER JOIN hashes ON fileinfo.h_id = hashes.id WHERE fileinfo.fn_id = ?;", &select_fileinfo_by_filename_id_resolved); LOCAL_PREP_STMT("SELECT p_id, fn_id, h_id, size, last_seen, stat_struct FROM fileinfo ;", &select_fileinfo_complete_table); - LOCAL_PREP_STMT("SELECT * FROM filenames;", &select_filename_complete_table); - LOCAL_PREP_STMT("SELECT * FROM hashes;", 
&select_hashes_complete_table); /* INSERT */ LOCAL_PREP_STMT("INSERT INTO filenames (name) VALUES (?);", &insert_filename); @@ -272,6 +281,7 @@ int prepare_statements() { LOCAL_PREP_STMT("SELECT COUNT(fileinfo.h_id) FROM fileinfo WHERE fileinfo.h_id = ?;", &count_fileinfo_by_hash_id); LOCAL_PREP_STMT("SELECT COUNT(fileinfo.fn_id) FROM fileinfo WHERE fileinfo.fn_id = ?;", &count_fileinfo_by_filename); LOCAL_PREP_STMT("SELECT COUNT(hashes.id) FROM hashes ;", &count_hashes); + LOCAL_PREP_STMT("SELECT COUNT(filenames.id) FROM filenames ;", &count_filenames); #undef LOCAL_PREP_STMT return 0; @@ -990,21 +1000,21 @@ inline int64_t call_count_query(sqlite3_stmt *st) { } /** - * Get an array containing all ids from table hashes with the first field + * Get an array containing all ids from table with the first field * containing the complete length of the array including this field. * * @return NULL on failure * an array on the heap which must be freed by the caller. */ -int64_t *dbi_select_hashes_all_ids() { +inline int64_t *call_select_all_ids(sqlite3_stmt *all_ids, sqlite3_stmt *count_query) { int64_t *result = NULL; int64_t rows = 0, id = 1, pos = 1; int strc = 0; - sqlite3_stmt *st = select_hashes_all_ids; + sqlite3_stmt *st = all_ids; DBCONN_CHECK(NULL); - rows = call_count_query(count_hashes); + rows = call_count_query(count_query); if (rows < 0) { return NULL; } @@ -1048,6 +1058,14 @@ int64_t *dbi_select_hashes_all_ids() { return result; } +int64_t *dbi_select_filenames_all_ids() { + return call_select_all_ids(select_filename_all_ids, count_filenames); +} + + +int64_t *dbi_select_hashes_all_ids() { + return call_select_all_ids(select_hashes_all_ids, count_hashes); +} /** * Iterate over the stored hashes, for those associated with more than @@ -1161,7 +1179,107 @@ int dbi_print_identical_hashes(FILE *out) { } -int dbi_print_identical_filenames(FILE *out); +int dbi_print_identical_filenames(FILE *out) { + int rc = 0; + int64_t fnid, count, id, i; + int64_t *fnid_array; 
+ int strc_fi = 0, strc_count = 0; + FILE *fd = out; + const unsigned char *txt = NULL; + sqlite3_stmt *stfi = select_fileinfo_by_filename_id_resolved, + *stcount = count_fileinfo_by_filename; + + DBCONN_CHECK(-1); + + if (fd == NULL) { fd = stdout; } + + /* SQLite only supports one query at a time per connection, therefore the + * segmented approach. Query for all filename ids. Iterate over the array and + * query to count the usage of each id, then query the details in case there's more than a + * single association. A second connection is possible, but would require + * another set of query preparation and other surrounding setup. + */ + fnid_array = dbi_select_filenames_all_ids(); + if (fnid_array == NULL) { + return -1; + } + + for (i=1; i<fnid_array[0]; i++) { + sqlite3_clear_bindings(stcount); + sqlite3_reset(stcount); + + /* prevent human errors */ + fnid = fnid_array[i]; + + if (sqlite3_bind_int64(stcount, 1, fnid) != SQLITE_OK) { + LOGERR("ERROR: Failed to bind hashes.id to prepared statement for count: %s\n", sqlite3_errmsg(dbconn)); + rc = -1; + break; + } + + strc_count = sqlite3_step(stcount); + if (strc_count == SQLITE_DONE) { /* Not found */ + continue; + } else if (strc_count != SQLITE_ROW) { + LOGERR("ERROR: Failed step: %s\n", sqlite3_errmsg(dbconn)); + rc = -1; + break; + } + + count = (int64_t) sqlite3_column_int64(stcount, 0); + DBGTRC("DEBUG: count results for hash id %ld: %ld\n", fnid, count); + + if (count>1) { + sqlite3_reset(stfi); + sqlite3_clear_bindings(stfi); + + if (sqlite3_bind_int64(stfi, 1, fnid) != SQLITE_OK) { + LOGERR("ERROR: Failed to bind hashes.id to prepared statement for count: %s\n", sqlite3_errmsg(dbconn)); + free(fnid_array); + return -1; + } + + do { + strc_fi = sqlite3_step(stfi); + + if (strc_fi == SQLITE_DONE) { + DBGTRC("DEBUG: finished for hid %ld\n", fnid); + break; + } + + if (strc_fi != SQLITE_ROW) { + LOGERR("ERROR: Failed step to get fileinfo content: %s\n", sqlite3_errmsg(dbconn)); + free(fnid_array); + return -1; /* drop-it */ + } 
+ + txt = sqlite3_column_text(stfi, 0); /* paths.pathname */ + fprintf(fd, "%s/", txt); + txt = sqlite3_column_text(stfi, 1); /* filenames.name */ + fprintf(fd, "%s;", txt); + txt = sqlite3_column_text(stfi, 2); /* hashes.blake2 */ + fprintf(fd, "%s;", txt); + txt = sqlite3_column_text(stfi, 3); /* hashes.sha256 */ + fprintf(fd, "%s;", txt); + txt = sqlite3_column_text(stfi, 4); /* hashes.sha512 */ + fprintf(fd, "%s;", txt); + id = (int64_t) sqlite3_column_int64(stfi, 5); /* fileinfo.size */ + fprintf(fd, "%ld\n", id); + } while (strc_fi == SQLITE_ROW); + } + sqlite3_reset(stfi); + sqlite3_clear_bindings(stfi); + } + + free(fnid_array); + + sqlite3_clear_bindings(stfi); + sqlite3_clear_bindings(stcount); + sqlite3_reset(stfi); + sqlite3_reset(stcount); + + return rc; +} diff --git a/src/duplicate_finder.c b/src/duplicate_finder.c index 8a058fd..f410f59 100644 --- a/src/duplicate_finder.c +++ b/src/duplicate_finder.c @@ -55,6 +55,15 @@ int analyze_db_content() { return EXIT_FAILURE; } + printf("\n\n---- IDENTICAL FILENAMES ----\n\n"); + + if (dbi_print_identical_filenames(stdout) < 0) { + LOGERR("ERROR: Identification of duplicates via filenames failed.\n"); + dbi_close(); + return EXIT_FAILURE; + } + + dbi_close(); return EXIT_SUCCESS; } |
