aboutsummaryrefslogtreecommitdiff
path: root/src/remove_existing_lines.c
diff options
context:
space:
mode:
authorThorsten Töpper <atsutane@freethoughts.de>2025-09-11 22:05:07 +0200
committerThorsten Töpper <atsutane@freethoughts.de>2025-09-11 22:05:07 +0200
commit5a611f358080d5e870db907e85fed3dc913a1f91 (patch)
tree727b733048365d1a1ebd8aa37366538eff615a74 /src/remove_existing_lines.c
parentae3b1829f9731662b5e2b626655a619733101209 (diff)
downloadsmall-utils-5a611f358080d5e870db907e85fed3dc913a1f91.tar.gz
small-utils-5a611f358080d5e870db907e85fed3dc913a1f91.tar.bz2
remove_existing_lines: mem_internal_check for varying strings
Diffstat (limited to 'src/remove_existing_lines.c')
-rw-r--r--src/remove_existing_lines.c465
1 files changed, 465 insertions, 0 deletions
diff --git a/src/remove_existing_lines.c b/src/remove_existing_lines.c
new file mode 100644
index 0000000..270dd4c
--- /dev/null
+++ b/src/remove_existing_lines.c
@@ -0,0 +1,465 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2025 Thorsten Töpper
+ *
+ * Compare two text files remove from one the lines which exist
+ * in the other one.
+ *
+ * Read file into memory in form of bucket lists, afterwards move
+ * the content into a single large array to be sorted via qsort.
+ * afterwards read the other file and print the lines not found by
+ * bsearch.
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include "trace_macros.h"
+#include "time_utils.h"
+
+
+#ifndef LINELENGTH
+#define LINELENGTH 4096
+#endif
+
+#ifndef BUCKET_SIZE
+/* X pointers array + pointer to next bucket + current element */
+#define BUCKET_SIZE 4094
+#endif
+
+#define LISTS 32
+
+
+struct data_array {
+ char **data;
+ size_t items;
+};
+
+struct data_bucket {
+ struct data_bucket *next;
+ size_t curpos;
+ char *content[BUCKET_SIZE];
+};
+
+/* Performance optimization: distribute the content by length across
+ * multiple lists.
+ */
+struct bucket_collection {
+ struct data_bucket *bucket[LISTS];
+ size_t items[LISTS];
+};
+
+
+struct data_array *create_array(size_t items);
+struct data_bucket *create_bucket(void);
+struct bucket_collection *create_bucket_collection(void);
+void destroy_bucket_collection(struct bucket_collection *bc);
+void destroy_bucket_list(struct data_bucket *db);
+void destroy_data_array(struct data_array *da);
+bool filter_file(FILE *fdin, FILE *fdout, struct data_array *da);
+bool insert_into_collection(struct bucket_collection *bc, char *str);
+size_t items_in_list(struct data_bucket *db);
+struct data_array *move_buckets_into_array(struct bucket_collection *bc);
+struct bucket_collection *read_data_from_stream(FILE *fdin, struct bucket_collection *oldbc);
+bool search_in_array(char *str, struct data_array *da);
+void set_items_in_collection(struct bucket_collection *bc);
+int sort_comp(const void *a, const void *b);
+void sort_data_array(char *data[], size_t items);
+
+
+/* === IMPLEMENTATION === */
+
+int sort_comp(const void *a, const void *b) {
+ /* casting is dark magic */
+ char * const *A = a;
+ char * const *B = b;
+ return strcmp(*A,*B);
+}
+
+
+inline void sort_data_array(char *data[], size_t items) {
+ if (data == NULL) { return; }
+ qsort(data, items, sizeof(char*), sort_comp);
+}
+
+
+inline bool search_in_array(char *str, struct data_array *da) {
+ char *cptr = NULL;
+
+ if (str == NULL || da == NULL) {
+ LOGERR("ERROR: Missing argument%s%s.\n",
+ (str==NULL) ? " str" : "",
+ (da==NULL) ? " data" : "");
+ return false;
+ }
+
+ cptr = bsearch(&str, &(da->data), da->items, sizeof(char*), sort_comp);
+
+ return (cptr==NULL) ? false : true ;
+}
+
+
+inline struct data_array *create_array(size_t items) {
+ struct data_array *da = NULL;
+ if (items == 0) {
+ LOGERR("ERROR: can't create an array size 0\n");
+ return NULL;
+ }
+
+ if ((da=calloc(1, sizeof(struct data_array))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for struct data_array\n");
+ return NULL;
+ }
+
+ da->items = items;
+
+ if ((da->data = calloc(items, sizeof(char*))) == NULL) {
+ LOGERR("ERROR: Failed to allocate pointer array for %lu items.\n",
+ items);
+ free(da);
+ return NULL;
+ }
+
+ return da;
+}
+
+
+inline struct data_bucket *create_bucket(void) {
+ struct data_bucket *b = NULL;
+ size_t i = 0;
+
+ if ((b = calloc(1, sizeof(struct data_bucket))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for new bucket\n");
+ return NULL;
+ }
+
+ b->curpos = 0;
+ b->next = NULL;
+ for (i=0; i<BUCKET_SIZE; i++) {
+ b->content[i] = NULL;
+ }
+ return b;
+}
+
+
+inline struct bucket_collection *create_bucket_collection(void) {
+ struct bucket_collection *bc = NULL;
+ /* size_t i = 0; */
+
+ if ((bc = calloc(1, sizeof(struct bucket_collection))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for new bucket collection\n");
+ return NULL;
+ }
+ /* Paranoid? uncomment
+ for (i=0; i<BUCKET_SIZE; i++) {
+ bc->bucket[i] = NULL;
+ bc->items[i] = 0;
+ }
+ */
+ return bc;
+}
+
+
+inline void destroy_data_array(struct data_array *da) {
+ size_t i = 0;
+ if (da == NULL) { return; }
+ for (;i<da->items; i++) {
+ if (da->data[i] != NULL)
+ free(da->data[i]);
+ }
+ free(da);
+}
+
+
+inline void destroy_bucket_list(struct data_bucket *db) {
+ size_t p = 0;
+ struct data_bucket *ptr = db, *tmp;
+
+ while (ptr != NULL) {
+ for (p=0; p<BUCKET_SIZE; p++) {
+ if (ptr->content[p] != NULL) {
+ free(ptr->content[p]);
+ }
+ }
+ tmp = ptr;
+ ptr = ptr->next;
+ free(tmp);
+ }
+ free(ptr);
+}
+
+
+inline void destroy_bucket_collection(struct bucket_collection *bc) {
+ size_t p = 0;
+ if (bc == NULL) return;
+ for (p=0; p<LISTS; p++) {
+ if (bc->bucket[p] == NULL) { continue; }
+ destroy_bucket_list(bc->bucket[p]);
+ }
+ free(bc);
+}
+
+
+inline size_t items_in_list(struct data_bucket *db) {
+ size_t s = 0;
+ struct data_bucket *ptr = db;
+ while (ptr != NULL) {
+ s += ptr->curpos;
+ ptr = ptr->next;
+ }
+ return s;
+}
+
+
+void set_items_in_collection(struct bucket_collection *bc) {
+ int i = 0;
+ if (bc == NULL) { return; }
+ for (i=0; i<LISTS; i++) {
+ bc->items[i] = items_in_list(bc->bucket[i]);
+ }
+}
+
+
+bool insert_into_collection(struct bucket_collection *bc, char *str) {
+ struct data_bucket *ptr = NULL;
+ size_t len = 0, lst = 0;
+
+ if (str == NULL || ((len=strlen(str)) == 0) ) {
+ return false;
+ }
+
+ lst = len % LISTS;
+ ptr = bc->bucket[lst];
+
+ if (ptr == NULL) {
+ if ((bc->bucket[lst] = create_bucket()) == NULL) {
+ return false;
+ }
+ ptr = bc->bucket[lst];
+ }
+
+ while (ptr != NULL) {
+ /* bucket full? */
+ if (ptr->curpos == BUCKET_SIZE) {
+ if (ptr->next == NULL) {
+ if ((ptr->next = create_bucket()) == NULL) {
+ return false;
+ }
+ }
+ ptr = ptr->next;
+ continue;
+ }
+ /* space in bucket */
+ ptr->content[ptr->curpos++] = str;
+ return true;
+ }
+
+ LOGERR("ERROR: Insert of '%s' into collection failed this code should never be reached\n", str);
+ return false;
+}
+
+
+struct data_array *move_buckets_into_array(struct bucket_collection *bc) {
+ struct data_array *array = NULL;
+ size_t total_items = 0, i = 0, j = 0, p = 0;
+ struct data_bucket *ptr = NULL;
+
+ if (bc == NULL) {
+ LOGERR("ERROR: No collection given.\n");
+ return NULL;
+ }
+
+ set_items_in_collection(bc);
+ for (i=0; i<LISTS; i++) {
+ total_items += bc->items[i];
+ }
+
+ if ((array = create_array(total_items)) == NULL) {
+ return NULL;
+ }
+
+ for (i=0; i<LISTS; i++) {
+ ptr = bc->bucket[i];
+ while (ptr != NULL) {
+ for (j=0; j<ptr->curpos; j++, p++) {
+ array->data[p] = ptr->content[j];
+ ptr->content[j] = NULL;
+ }
+ ptr = ptr->next;
+ }
+ }
+
+ return array;
+}
+
+/* Complete input stream management outside */
+struct bucket_collection *read_data_from_stream(FILE *fdin, struct bucket_collection *oldbc) {
+ struct bucket_collection *bc = oldbc;
+ char line[LINELENGTH], *str=NULL;
+ size_t len = 0;
+
+ if (fdin == NULL) {
+ LOGERR("ERROR: No input stream given.\n");
+ return NULL;
+ }
+
+ if (bc == NULL) {
+ if ( (bc = create_bucket_collection()) == NULL ) {
+ return NULL;
+ }
+ }
+
+ while (fgets(line, LINELENGTH, fdin) != NULL) {
+ len = strlen(line);
+ if ((str = calloc(len+1, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate %lu bytes memory for string %s",
+ len, line); /* keep the line break in the string */
+ destroy_bucket_collection(bc);
+ return NULL;
+ }
+ strcpy(str,line);
+ if (!insert_into_collection(bc,str)) {
+ LOGERR("ERROR: abort due to failed insertion of %s", str);
+ free(str);
+ destroy_bucket_collection(bc);
+ return NULL;
+ }
+ }
+
+ return bc;
+}
+
+
+inline bool filter_file(FILE *fdin, FILE *fdout, struct data_array *da) {
+ char line[LINELENGTH];
+
+ if (fdin == NULL || da == NULL) {
+ LOGERR("ERROR: Missing argument%s%s%s.\n",
+ (fdin==NULL) ? " input stream" : "",
+ (fdout==NULL) ? " output stream" : "",
+ (da==NULL) ? " data" : "");
+ return false;
+ }
+
+ while (fgets(line, LINELENGTH, fdin) != NULL) {
+ if ( ! search_in_array(line, da) ) {
+ fputs(line, fdout);
+ }
+ }
+
+ return true;
+}
+
+
+
+#define usage(x) \
+ fprintf(stderr, "Usage: %s output filter_file data_file...\n", x);
+
+int main(int argc, char **argv) {
+ FILE *output = NULL, *input = NULL;;
+ int data_index = 2, opt = 0, filter_index = 1, output_index = 0;
+ size_t s = 0;
+ struct bucket_collection *bc = NULL;
+ struct data_array *array = NULL;
+ struct timespec t1, t2, tdiff;
+
+ if (argc < 3) {
+ usage(argv[0]);
+ return EXIT_FAILURE;
+ }
+
+
+ while ((opt = getopt(argc, argv, "h")) != -1) {
+ switch (opt) {
+ case 'h':
+ usage(argv[0]);
+ return EXIT_SUCCESS;
+ default:
+ LOGERR("ERROR: Unknown option %c.\n", opt);
+ exit(EXIT_FAILURE);
+ };
+ }
+
+ output_index += optind;
+ filter_index += optind;
+ data_index += optind;
+
+ DBGTRC("DEBUG: filter index %d / i %d / optind %d / output_index %d\n",
+ filter_index, data_index, optind, output_index);
+ DBGTRC("DEBUG: output_file %s\n", argv[output_index]);
+ DBGTRC("DEBUG: filter_file %s\n", argv[filter_index]);
+ DBGTRC("DEBUG: first data %s\n", argv[data_index]);
+
+
+ s = strlen(argv[output_index]);
+ if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) {
+ output = stdout;
+ } else {
+ if ((output=fopen(argv[output_index], "w")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+ argv[output_index], strerror(errno), errno);
+ return EXIT_FAILURE;
+ }
+ }
+
+ LOGERR("IMPORT FILE %s\n", argv[filter_index]);
+ if ((input = fopen(argv[filter_index], "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+ argv[filter_index], strerror(errno), errno);
+ return EXIT_FAILURE;
+ }
+ bc = read_data_from_stream(input, bc);
+ fclose(input);
+ input=NULL;
+ if (bc == NULL) {
+ return EXIT_FAILURE;
+ }
+
+ array = move_buckets_into_array(bc);
+ destroy_bucket_collection(bc);
+ bc = NULL;
+
+ LOGERR("run qsort on in-memory data\n");
+ TU_MEASURE_TIME( CLOCK_PROCESS_CPUTIME_ID, &t1, &t2,
+ sort_data_array(array->data, array->items);
+ );
+ difftime_timespec(t1, t2, &tdiff);
+ LOGERR("Sorted in %lu s and %lu ns process CPU time\n", tdiff.tv_sec, tdiff.tv_nsec );
+
+
+
+ for (; data_index<argc; data_index++) {
+ LOGERR("APPLY FILTER ON FILE %s\n", argv[data_index]);
+ if ((argv[data_index][0] == '-' && argv[data_index][1] == '\0') ||
+ (strncmp(argv[data_index], "stdin",7) == 0)) {
+ input = stdin;
+ } else {
+ if ((input = fopen(argv[data_index], "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+ argv[data_index], strerror(errno), errno);
+ return EXIT_FAILURE;
+ }
+ }
+
+ filter_file(input, output, array);
+
+ if (input != stdin)
+ fclose(input);
+ }
+ fflush(output);
+ fclose(output);
+
+ return EXIT_SUCCESS;
+}
+