/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * Compare two text files remove from one the lines which exist * in the other one. * * Read file into memory in form of bucket lists, afterwards move * the content into a single large array to be sorted via qsort. * afterwards read the other file and print the lines not found by * bsearch. * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include #include #include #include #include #include "trace_macros.h" #include "time_utils.h" #ifndef LINELENGTH #define LINELENGTH 4096 #endif #ifndef BUCKET_SIZE /* X pointers array + pointer to next bucket + current element */ #define BUCKET_SIZE 4094 #endif #define LISTS 32 struct data_array { char **data; size_t items; }; struct data_bucket { struct data_bucket *next; size_t curpos; char *content[BUCKET_SIZE]; }; /* Performance optimization: distribute the content by length across * multiple lists. */ struct bucket_collection { struct data_bucket *bucket[LISTS]; size_t items[LISTS]; }; struct data_array *create_array(size_t items); struct data_bucket *create_bucket(void); struct bucket_collection *create_bucket_collection(void); void destroy_bucket_collection(struct bucket_collection *bc); void destroy_bucket_list(struct data_bucket *db); void destroy_data_array(struct data_array *da); bool filter_file(FILE *fdin, FILE *fdout, struct data_array *da); bool insert_into_collection(struct bucket_collection *bc, char *str); size_t items_in_list(struct data_bucket *db); struct data_array *move_buckets_into_array(struct bucket_collection *bc); struct bucket_collection *read_data_from_stream(FILE *fdin, struct bucket_collection *oldbc); bool search_in_array(char *str, struct data_array *da); void set_items_in_collection(struct bucket_collection *bc); int sort_comp(const void *a, const void *b); void sort_data_array(char *data[], size_t items); /* === IMPLEMENTATION === */ int sort_comp(const void *a, const void *b) { /* casting is dark magic */ char * const *A = a; char * const *B = b; return strcmp(*A,*B); } inline void sort_data_array(char *data[], size_t items) { if (data == NULL) { return; } qsort(data, items, sizeof(char*), sort_comp); } inline bool search_in_array(char *str, struct data_array *da) { char *cptr = NULL; if (str == NULL || da == NULL) { LOGERR("ERROR: Missing argument%s%s.\n", (str==NULL) ? " str" : "", (da==NULL) ? " data" : ""); return false; } cptr = bsearch(&str, &(da->data), da->items, sizeof(char*), sort_comp); return (cptr==NULL) ? false : true ; } inline struct data_array *create_array(size_t items) { struct data_array *da = NULL; if (items == 0) { LOGERR("ERROR: can't create an array size 0\n"); return NULL; } if ((da=calloc(1, sizeof(struct data_array))) == NULL) { LOGERR("ERROR: Failed to allocate memory for struct data_array\n"); return NULL; } da->items = items; if ((da->data = calloc(items, sizeof(char*))) == NULL) { LOGERR("ERROR: Failed to allocate pointer array for %lu items.\n", items); free(da); return NULL; } return da; } inline struct data_bucket *create_bucket(void) { struct data_bucket *b = NULL; size_t i = 0; if ((b = calloc(1, sizeof(struct data_bucket))) == NULL) { LOGERR("ERROR: Failed to allocate memory for new bucket\n"); return NULL; } b->curpos = 0; b->next = NULL; for (i=0; icontent[i] = NULL; } return b; } inline struct bucket_collection *create_bucket_collection(void) { struct bucket_collection *bc = NULL; /* size_t i = 0; */ if ((bc = calloc(1, sizeof(struct bucket_collection))) == NULL) { LOGERR("ERROR: Failed to allocate memory for new bucket collection\n"); return NULL; } /* Paranoid? uncomment for (i=0; ibucket[i] = NULL; bc->items[i] = 0; } */ return bc; } inline void destroy_data_array(struct data_array *da) { size_t i = 0; if (da == NULL) { return; } for (;iitems; i++) { if (da->data[i] != NULL) free(da->data[i]); } free(da); } inline void destroy_bucket_list(struct data_bucket *db) { size_t p = 0; struct data_bucket *ptr = db, *tmp; while (ptr != NULL) { for (p=0; pcontent[p] != NULL) { free(ptr->content[p]); } } tmp = ptr; ptr = ptr->next; free(tmp); } free(ptr); } inline void destroy_bucket_collection(struct bucket_collection *bc) { size_t p = 0; if (bc == NULL) return; for (p=0; pbucket[p] == NULL) { continue; } destroy_bucket_list(bc->bucket[p]); } free(bc); } inline size_t items_in_list(struct data_bucket *db) { size_t s = 0; struct data_bucket *ptr = db; while (ptr != NULL) { s += ptr->curpos; ptr = ptr->next; } return s; } void set_items_in_collection(struct bucket_collection *bc) { int i = 0; if (bc == NULL) { return; } for (i=0; iitems[i] = items_in_list(bc->bucket[i]); } } bool insert_into_collection(struct bucket_collection *bc, char *str) { struct data_bucket *ptr = NULL; size_t len = 0, lst = 0; if (str == NULL || ((len=strlen(str)) == 0) ) { return false; } lst = len % LISTS; ptr = bc->bucket[lst]; if (ptr == NULL) { if ((bc->bucket[lst] = create_bucket()) == NULL) { return false; } ptr = bc->bucket[lst]; } while (ptr != NULL) { /* bucket full? */ if (ptr->curpos == BUCKET_SIZE) { if (ptr->next == NULL) { if ((ptr->next = create_bucket()) == NULL) { return false; } } ptr = ptr->next; continue; } /* space in bucket */ ptr->content[ptr->curpos++] = str; return true; } LOGERR("ERROR: Insert of '%s' into collection failed this code should never be reached\n", str); return false; } struct data_array *move_buckets_into_array(struct bucket_collection *bc) { struct data_array *array = NULL; size_t total_items = 0, i = 0, j = 0, p = 0; struct data_bucket *ptr = NULL; if (bc == NULL) { LOGERR("ERROR: No collection given.\n"); return NULL; } set_items_in_collection(bc); for (i=0; iitems[i]; } if ((array = create_array(total_items)) == NULL) { return NULL; } for (i=0; ibucket[i]; while (ptr != NULL) { for (j=0; jcurpos; j++, p++) { array->data[p] = ptr->content[j]; ptr->content[j] = NULL; } ptr = ptr->next; } } return array; } /* Complete input stream management outside */ struct bucket_collection *read_data_from_stream(FILE *fdin, struct bucket_collection *oldbc) { struct bucket_collection *bc = oldbc; char line[LINELENGTH], *str=NULL; size_t len = 0; if (fdin == NULL) { LOGERR("ERROR: No input stream given.\n"); return NULL; } if (bc == NULL) { if ( (bc = create_bucket_collection()) == NULL ) { return NULL; } } while (fgets(line, LINELENGTH, fdin) != NULL) { len = strlen(line); if ((str = calloc(len+1, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate %lu bytes memory for string %s", len, line); /* keep the line break in the string */ destroy_bucket_collection(bc); return NULL; } strcpy(str,line); if (!insert_into_collection(bc,str)) { LOGERR("ERROR: abort due to failed insertion of %s", str); free(str); destroy_bucket_collection(bc); return NULL; } } return bc; } inline bool filter_file(FILE *fdin, FILE *fdout, struct data_array *da) { char line[LINELENGTH]; if (fdin == NULL || da == NULL) { LOGERR("ERROR: Missing argument%s%s%s.\n", (fdin==NULL) ? " input stream" : "", (fdout==NULL) ? " output stream" : "", (da==NULL) ? " data" : ""); return false; } while (fgets(line, LINELENGTH, fdin) != NULL) { if ( ! search_in_array(line, da) ) { fputs(line, fdout); } } return true; } #define usage(x) \ fprintf(stderr, "Usage: %s output filter_file data_file...\n", x); int main(int argc, char **argv) { FILE *output = NULL, *input = NULL;; int data_index = 2, opt = 0, filter_index = 1, output_index = 0; size_t s = 0; struct bucket_collection *bc = NULL; struct data_array *array = NULL; struct timespec t1, t2, tdiff; if (argc < 3) { usage(argv[0]); return EXIT_FAILURE; } while ((opt = getopt(argc, argv, "h")) != -1) { switch (opt) { case 'h': usage(argv[0]); return EXIT_SUCCESS; default: LOGERR("ERROR: Unknown option %c.\n", opt); exit(EXIT_FAILURE); }; } output_index += optind; filter_index += optind; data_index += optind; DBGTRC("DEBUG: filter index %d / i %d / optind %d / output_index %d\n", filter_index, data_index, optind, output_index); DBGTRC("DEBUG: output_file %s\n", argv[output_index]); DBGTRC("DEBUG: filter_file %s\n", argv[filter_index]); DBGTRC("DEBUG: first data %s\n", argv[data_index]); s = strlen(argv[output_index]); if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) { output = stdout; } else { if ((output=fopen(argv[output_index], "w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", argv[output_index], strerror(errno), errno); return EXIT_FAILURE; } } LOGERR("IMPORT FILE %s\n", argv[filter_index]); if ((input = fopen(argv[filter_index], "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", argv[filter_index], strerror(errno), errno); return EXIT_FAILURE; } bc = read_data_from_stream(input, bc); fclose(input); input=NULL; if (bc == NULL) { return EXIT_FAILURE; } array = move_buckets_into_array(bc); destroy_bucket_collection(bc); bc = NULL; LOGERR("run qsort on in-memory data\n"); TU_MEASURE_TIME( CLOCK_PROCESS_CPUTIME_ID, &t1, &t2, sort_data_array(array->data, array->items); ); difftime_timespec(t1, t2, &tdiff); LOGERR("Sorted in %lu s and %lu ns process CPU time\n", tdiff.tv_sec, tdiff.tv_nsec ); for (; data_index