diff options
| author | Thorsten Töpper <atsutane@freethoughts.de> | 2025-09-11 22:05:07 +0200 |
|---|---|---|
| committer | Thorsten Töpper <atsutane@freethoughts.de> | 2025-09-11 22:05:07 +0200 |
| commit | 5a611f358080d5e870db907e85fed3dc913a1f91 (patch) | |
| tree | 727b733048365d1a1ebd8aa37366538eff615a74 /src | |
| parent | ae3b1829f9731662b5e2b626655a619733101209 (diff) | |
| download | small-utils-5a611f358080d5e870db907e85fed3dc913a1f91.tar.gz small-utils-5a611f358080d5e870db907e85fed3dc913a1f91.tar.bz2 | |
remove_existing_lines: mem_internal_check for varying strings
Diffstat (limited to 'src')
| -rw-r--r-- | src/mem_internal_check.c | 2 | ||||
| -rw-r--r-- | src/remove_existing_lines.c | 465 |
2 files changed, 466 insertions, 1 deletions
diff --git a/src/mem_internal_check.c b/src/mem_internal_check.c index bda270e..c9d38f8 100644 --- a/src/mem_internal_check.c +++ b/src/mem_internal_check.c @@ -500,7 +500,7 @@ int main(int argc, char **argv) { } else { if ((output=fopen(argv[output_index], "w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", - argv[1], strerror(errno), errno); + argv[output_index], strerror(errno), errno); return EXIT_FAILURE; } } diff --git a/src/remove_existing_lines.c b/src/remove_existing_lines.c new file mode 100644 index 0000000..270dd4c --- /dev/null +++ b/src/remove_existing_lines.c @@ -0,0 +1,465 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2025 Thorsten Töpper + * + * Compare two text files remove from one the lines which exist + * in the other one. + * + * Read file into memory in form of bucket lists, afterwards move + * the content into a single large array to be sorted via qsort. + * afterwards read the other file and print the lines not found by + * bsearch. + * + * vim:ts=4:sw=4:expandtab + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdint.h> +#include <errno.h> +#include <ctype.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include "trace_macros.h" +#include "time_utils.h" + + +#ifndef LINELENGTH +#define LINELENGTH 4096 +#endif + +#ifndef BUCKET_SIZE +/* X pointers array + pointer to next bucket + current element */ +#define BUCKET_SIZE 4094 +#endif + +#define LISTS 32 + + +struct data_array { + char **data; + size_t items; +}; + +struct data_bucket { + struct data_bucket *next; + size_t curpos; + char *content[BUCKET_SIZE]; +}; + +/* Performance optimization: distribute the content by length across + * multiple lists. + */ +struct bucket_collection { + struct data_bucket *bucket[LISTS]; + size_t items[LISTS]; +}; + + +struct data_array *create_array(size_t items); +struct data_bucket *create_bucket(void); +struct bucket_collection *create_bucket_collection(void); +void destroy_bucket_collection(struct bucket_collection *bc); +void destroy_bucket_list(struct data_bucket *db); +void destroy_data_array(struct data_array *da); +bool filter_file(FILE *fdin, FILE *fdout, struct data_array *da); +bool insert_into_collection(struct bucket_collection *bc, char *str); +size_t items_in_list(struct data_bucket *db); +struct data_array *move_buckets_into_array(struct bucket_collection *bc); +struct bucket_collection *read_data_from_stream(FILE *fdin, struct bucket_collection *oldbc); +bool search_in_array(char *str, struct data_array *da); +void set_items_in_collection(struct bucket_collection *bc); +int sort_comp(const void *a, const void *b); +void sort_data_array(char *data[], size_t items); + + +/* === IMPLEMENTATION === */ + +int sort_comp(const void *a, const void *b) { + /* casting is dark magic */ + char * const *A = a; + char * const *B = b; + return strcmp(*A,*B); +} + + +inline void sort_data_array(char *data[], size_t items) { + if (data == NULL) { return; } + qsort(data, items, sizeof(char*), sort_comp); +} + + +inline bool search_in_array(char *str, struct data_array *da) { + char *cptr = NULL; + + if (str == NULL || da == NULL) { + LOGERR("ERROR: Missing argument%s%s.\n", + (str==NULL) ? " str" : "", + (da==NULL) ? " data" : ""); + return false; + } + + cptr = bsearch(&str, &(da->data), da->items, sizeof(char*), sort_comp); + + return (cptr==NULL) ? false : true ; +} + + +inline struct data_array *create_array(size_t items) { + struct data_array *da = NULL; + if (items == 0) { + LOGERR("ERROR: can't create an array size 0\n"); + return NULL; + } + + if ((da=calloc(1, sizeof(struct data_array))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for struct data_array\n"); + return NULL; + } + + da->items = items; + + if ((da->data = calloc(items, sizeof(char*))) == NULL) { + LOGERR("ERROR: Failed to allocate pointer array for %lu items.\n", + items); + free(da); + return NULL; + } + + return da; +} + + +inline struct data_bucket *create_bucket(void) { + struct data_bucket *b = NULL; + size_t i = 0; + + if ((b = calloc(1, sizeof(struct data_bucket))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for new bucket\n"); + return NULL; + } + + b->curpos = 0; + b->next = NULL; + for (i=0; i<BUCKET_SIZE; i++) { + b->content[i] = NULL; + } + return b; +} + + +inline struct bucket_collection *create_bucket_collection(void) { + struct bucket_collection *bc = NULL; + /* size_t i = 0; */ + + if ((bc = calloc(1, sizeof(struct bucket_collection))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for new bucket collection\n"); + return NULL; + } + /* Paranoid? uncomment + for (i=0; i<BUCKET_SIZE; i++) { + bc->bucket[i] = NULL; + bc->items[i] = 0; + } + */ + return bc; +} + + +inline void destroy_data_array(struct data_array *da) { + size_t i = 0; + if (da == NULL) { return; } + for (;i<da->items; i++) { + if (da->data[i] != NULL) + free(da->data[i]); + } + free(da); +} + + +inline void destroy_bucket_list(struct data_bucket *db) { + size_t p = 0; + struct data_bucket *ptr = db, *tmp; + + while (ptr != NULL) { + for (p=0; p<BUCKET_SIZE; p++) { + if (ptr->content[p] != NULL) { + free(ptr->content[p]); + } + } + tmp = ptr; + ptr = ptr->next; + free(tmp); + } + free(ptr); +} + + +inline void destroy_bucket_collection(struct bucket_collection *bc) { + size_t p = 0; + if (bc == NULL) return; + for (p=0; p<LISTS; p++) { + if (bc->bucket[p] == NULL) { continue; } + destroy_bucket_list(bc->bucket[p]); + } + free(bc); +} + + +inline size_t items_in_list(struct data_bucket *db) { + size_t s = 0; + struct data_bucket *ptr = db; + while (ptr != NULL) { + s += ptr->curpos; + ptr = ptr->next; + } + return s; +} + + +void set_items_in_collection(struct bucket_collection *bc) { + int i = 0; + if (bc == NULL) { return; } + for (i=0; i<LISTS; i++) { + bc->items[i] = items_in_list(bc->bucket[i]); + } +} + + +bool insert_into_collection(struct bucket_collection *bc, char *str) { + struct data_bucket *ptr = NULL; + size_t len = 0, lst = 0; + + if (str == NULL || ((len=strlen(str)) == 0) ) { + return false; + } + + lst = len % LISTS; + ptr = bc->bucket[lst]; + + if (ptr == NULL) { + if ((bc->bucket[lst] = create_bucket()) == NULL) { + return false; + } + ptr = bc->bucket[lst]; + } + + while (ptr != NULL) { + /* bucket full? */ + if (ptr->curpos == BUCKET_SIZE) { + if (ptr->next == NULL) { + if ((ptr->next = create_bucket()) == NULL) { + return false; + } + } + ptr = ptr->next; + continue; + } + /* space in bucket */ + ptr->content[ptr->curpos++] = str; + return true; + } + + LOGERR("ERROR: Insert of '%s' into collection failed this code should never be reached\n", str); + return false; +} + + +struct data_array *move_buckets_into_array(struct bucket_collection *bc) { + struct data_array *array = NULL; + size_t total_items = 0, i = 0, j = 0, p = 0; + struct data_bucket *ptr = NULL; + + if (bc == NULL) { + LOGERR("ERROR: No collection given.\n"); + return NULL; + } + + set_items_in_collection(bc); + for (i=0; i<LISTS; i++) { + total_items += bc->items[i]; + } + + if ((array = create_array(total_items)) == NULL) { + return NULL; + } + + for (i=0; i<LISTS; i++) { + ptr = bc->bucket[i]; + while (ptr != NULL) { + for (j=0; j<ptr->curpos; j++, p++) { + array->data[p] = ptr->content[j]; + ptr->content[j] = NULL; + } + ptr = ptr->next; + } + } + + return array; +} + +/* Complete input stream management outside */ +struct bucket_collection *read_data_from_stream(FILE *fdin, struct bucket_collection *oldbc) { + struct bucket_collection *bc = oldbc; + char line[LINELENGTH], *str=NULL; + size_t len = 0; + + if (fdin == NULL) { + LOGERR("ERROR: No input stream given.\n"); + return NULL; + } + + if (bc == NULL) { + if ( (bc = create_bucket_collection()) == NULL ) { + return NULL; + } + } + + while (fgets(line, LINELENGTH, fdin) != NULL) { + len = strlen(line); + if ((str = calloc(len+1, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate %lu bytes memory for string %s", + len, line); /* keep the line break in the string */ + destroy_bucket_collection(bc); + return NULL; + } + strcpy(str,line); + if (!insert_into_collection(bc,str)) { + LOGERR("ERROR: abort due to failed insertion of %s", str); + free(str); + destroy_bucket_collection(bc); + return NULL; + } + } + + return bc; +} + + +inline bool filter_file(FILE *fdin, FILE *fdout, struct data_array *da) { + char line[LINELENGTH]; + + if (fdin == NULL || da == NULL) { + LOGERR("ERROR: Missing argument%s%s%s.\n", + (fdin==NULL) ? " input stream" : "", + (fdout==NULL) ? " output stream" : "", + (da==NULL) ? " data" : ""); + return false; + } + + while (fgets(line, LINELENGTH, fdin) != NULL) { + if ( ! search_in_array(line, da) ) { + fputs(line, fdout); + } + } + + return true; +} + + + +#define usage(x) \ + fprintf(stderr, "Usage: %s output filter_file data_file...\n", x); + +int main(int argc, char **argv) { + FILE *output = NULL, *input = NULL;; + int data_index = 2, opt = 0, filter_index = 1, output_index = 0; + size_t s = 0; + struct bucket_collection *bc = NULL; + struct data_array *array = NULL; + struct timespec t1, t2, tdiff; + + if (argc < 3) { + usage(argv[0]); + return EXIT_FAILURE; + } + + + while ((opt = getopt(argc, argv, "h")) != -1) { + switch (opt) { + case 'h': + usage(argv[0]); + return EXIT_SUCCESS; + default: + LOGERR("ERROR: Unknown option %c.\n", opt); + exit(EXIT_FAILURE); + }; + } + + output_index += optind; + filter_index += optind; + data_index += optind; + + DBGTRC("DEBUG: filter index %d / i %d / optind %d / output_index %d\n", + filter_index, data_index, optind, output_index); + DBGTRC("DEBUG: output_file %s\n", argv[output_index]); + DBGTRC("DEBUG: filter_file %s\n", argv[filter_index]); + DBGTRC("DEBUG: first data %s\n", argv[data_index]); + + + s = strlen(argv[output_index]); + if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) { + output = stdout; + } else { + if ((output=fopen(argv[output_index], "w")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + argv[output_index], strerror(errno), errno); + return EXIT_FAILURE; + } + } + + LOGERR("IMPORT FILE %s\n", argv[filter_index]); + if ((input = fopen(argv[filter_index], "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + argv[filter_index], strerror(errno), errno); + return EXIT_FAILURE; + } + bc = read_data_from_stream(input, bc); + fclose(input); + input=NULL; + if (bc == NULL) { + return EXIT_FAILURE; + } + + array = move_buckets_into_array(bc); + destroy_bucket_collection(bc); + bc = NULL; + + LOGERR("run qsort on in-memory data\n"); + TU_MEASURE_TIME( CLOCK_PROCESS_CPUTIME_ID, &t1, &t2, + sort_data_array(array->data, array->items); + ); + difftime_timespec(t1, t2, &tdiff); + LOGERR("Sorted in %lu s and %lu ns process CPU time\n", tdiff.tv_sec, tdiff.tv_nsec ); + + + + for (; data_index<argc; data_index++) { + LOGERR("APPLY FILTER ON FILE %s\n", argv[data_index]); + if ((argv[data_index][0] == '-' && argv[data_index][1] == '\0') || + (strncmp(argv[data_index], "stdin",7) == 0)) { + input = stdin; + } else { + if ((input = fopen(argv[data_index], "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + argv[data_index], strerror(errno), errno); + return EXIT_FAILURE; + } + } + + filter_file(input, output, array); + + if (input != stdin) + fclose(input); + } + fflush(output); + fclose(output); + + return EXIT_SUCCESS; +} + |
