/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * It takes quite some time for sort if it's about text files with multiple * gigabytes content. Split the content into buckets and afterwards sort those. * Once done the content of those sorted temporary files can be merged via cat. * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include #include "output.h" /* Also used for paths => at least PATH_MAX */ #define BUFFERSIZE 4096 /* The arguments will be * PREFIX - file prefix... * SPLIT_LENGTH - the number of characters which will be relevant for file selection * FILES to split */ struct list_node { char *name; struct list_node *next; FILE *fd; }; struct list_head { struct list_node *first; size_t splitlength; size_t length; }; /* DECLARATIONS */ int push_into_list_unique(struct list_head *list, char *name); int split_into_files(struct list_head *list, char *inputfile, char *prefix); size_t list_check_length(struct list_head *list); struct list_head *stdin_handling(struct list_head *list, size_t splitlength, char *prefix); struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname); struct list_node *get_node(struct list_head *list, char *name); void destroy_list(struct list_head *list); void fflush_list(struct list_head *list); int set_nofile_limit_to_hard(); #ifdef DEBUGBUILD void print_list(struct list_head *list, FILE *out); #endif /* GLOBAL VARIABLES */ int option_append_mode = 0; /* IMPLEMENTATION */ /* modern Linux distributions set the soft limit to 1024 file descriptors, * this may not be sufficient, therefore increase to the hard limit. * */ inline int set_nofile_limit_to_hard() { struct rlimit lim; if (getrlimit(RLIMIT_NOFILE , &lim) != 0) { LOGERR("ERROR: Failed to get RLIMIT_NOFILE: %s (errno %d)\n", strerror(errno), errno); return -1; } lim.rlim_cur = lim.rlim_max; if (setrlimit(RLIMIT_NOFILE , &lim) != 0) { LOGERR("ERROR: Failed to set RLIMIT_NOFILE: %s (errno %d)\n", strerror(errno), errno); return -1; } return 0; } inline void destroy_list(struct list_head *list) { struct list_node *n = NULL, *b = NULL; if (list == NULL) return; n = list->first; while (n != NULL) { b = n->next; if (n->name != NULL) free(n->name); if (n->fd != NULL) fclose(n->fd); free(n); n = b; } free(list); } inline void fflush_list(struct list_head *list) { struct list_node *n = NULL; if (list == NULL) return; n = list->first; while (n != NULL) { if (n->fd != NULL) { fflush(n->fd); } n = n->next; } } #ifdef DEBUGBUILD void print_list(struct list_head *list, FILE *out) { struct list_node *n = NULL; if (list == NULL) return; if (out == NULL) out = stderr; fprintf(out, "length: %lu\n",list_check_length(list)); n = list->first; while (n != NULL) { fprintf(out, "node '%s' %s\n", n->name, ((n->fd != NULL) ? "open file descriptor" : "")); n = n->next; } } #endif inline size_t list_check_length(struct list_head *list) { size_t l = 0; struct list_node *ptr = NULL; if (list == NULL) { LOGERR("ERROR: No list given.\n"); return 0; } ptr = list->first; while (ptr != NULL) { l++; ptr = ptr->next; } list->length = l; return l; } inline int push_into_list_unique(struct list_head *list, char *name) { size_t name_length = 0; struct list_node *ptr = NULL, *tmp = NULL; if (list == NULL || name == NULL || name[0] == '\0') { LOGERR("ERROR: Invalid function arguments.\n"); return -1; } ptr = list->first; while (ptr != NULL) { if (strcmp(ptr->name, name) == 0) { return 0; } ptr = ptr->next; } if ((tmp = calloc(1, sizeof(struct list_node))) == NULL) { LOGERR("ERROR: Failed to allocate memory for new node\n"); return -2; } name_length = strlen(name); if ((tmp->name = calloc(name_length+1, sizeof(char))) == NULL) { LOGERR("ERROR Failed to allocate %lu bytes for data in list\n", strlen(name)+1); free(tmp); return -3; } memcpy(tmp->name, name, name_length); tmp->fd = NULL; tmp->next = list->first; list->first = tmp; list->length++; return 0; } inline struct list_node *get_node(struct list_head *list, char *name) { struct list_node *n = NULL; if (list == NULL || name == NULL || name[0] == '\0') { LOGERR("ERROR: Invalid arguments\n"); return NULL; } n = list->first; while (n != NULL) { if (strcmp(n->name, name) == 0) return n; n = n->next; } return NULL; } /* Open the given file and extend / create a list of output filenames, * based on the read content. * splitlength - the length of a line considered regarding the comparision. */ struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname) { FILE *fd = NULL; char *line = NULL, *previous = NULL; size_t i = 0, len = 0; if (fname == NULL || fname[0] == '\0' || splitlength == 0) { LOGERR("ERROR: filename or splitlength invalid\n"); return NULL; } if ((fd=fopen(fname, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s\n", fname, strerror(errno)); return NULL; } if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); fclose(fd); return NULL; } if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); free(line); fclose(fd); return NULL; } if ((list == NULL) && \ ((list = calloc(1, sizeof(struct list_head))) == NULL)) { LOGERR("ERROR: Failed to create new list"); free(line); fclose(fd); return NULL; } list->splitlength = splitlength; while (fgets(line, BUFFERSIZE, fd) != NULL) { len = strlen(line); len = (splitlength < len) ? splitlength : len; /* most simple way to stick with FS compatible characters */ for (i=0; isplitlength; while (fgets(line, BUFFERSIZE, fdin) != NULL) { strncpy(line_head, line, sl+1); len = strnlen(line_head, sl); len = (lenfd == NULL) { snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name); if ((node->fd = fopen(filename, (option_append_mode)?"a":"w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); free(line); free(line_head); free(filename); fclose(fdin); return -4; } } strncpy(prev_head, line_head, sl); } if (fputs(line, node->fd) == EOF) { LOGERR("ERROR: Failed to write into output file for '%s': %s\n", line_head, strerror(errno)); free(line); free(line_head); free(filename); fclose(fdin); return -5; } } fflush_list(list); fclose(fdin); free(line); free(line_head); free(filename); return 0; } /* This covers extend_list and split_into_files for the stdin stream. * The other two functions can't be used as extend_list would read * the data from stdin and split_into_files would have nothing to work with. */ struct list_head *stdin_handling(struct list_head *list, size_t splitlength, char *prefix) { FILE *fdin = stdin; char *filename = NULL, *line = NULL, *line_head = NULL, *previous = NULL; size_t i=0, len = 0; struct list_node *node = NULL; if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); return NULL; } if ((line_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); free(line); return NULL; } if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); free(line); free(line_head); return NULL; } if ((filename = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for input buffer.\n"); free(line); free(line_head); free(previous); return NULL; } if ((list == NULL) && \ ((list = calloc(1, sizeof(struct list_head))) == NULL)) { LOGERR("ERROR: Failed to create new list"); free(line); free(line_head); free(previous); free(filename); return NULL; } list->splitlength = splitlength; while (fgets(line, BUFFERSIZE, fdin) != NULL) { len = strlen(line); len = (splitlength < len) ? splitlength : len; /* most simple way to stick with FS compatible characters */ for (i=0; ifd == NULL) { snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name); if ((node->fd = fopen(filename, (option_append_mode)?"a":"w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); free(line); free(line_head); free(previous); free(filename); destroy_list(list); return NULL; } } strncpy(previous, line_head, splitlength); } if (fputs(line, node->fd) == EOF) { LOGERR("ERROR: Failed to write into output file for '%s': %s\n", line_head, strerror(errno)); free(line); free(line_head); free(previous); free(filename); destroy_list(list); return NULL; } } free(line); free(line_head); free(previous); free(filename); return list; } int main(int argc, char **argv) { int data_index = 2, length_index = 1, opt = 0, output_index = 0; size_t splitlength = 0; struct list_head *list = NULL, *lsttmp = NULL; if (argc < 4) { fprintf(stderr, "Usage: %s prefix length files...\n", argv[0]); fprintf(stderr, "or %s -a prefix length files...\n\n", argv[0]); fprintf(stderr, "-a - don't overwrite existing files, instead append to those\n" "\tprefix - used with the output filenames\n" "\tlength - the number of characters relevant for comparing\n"); return EXIT_FAILURE; } set_nofile_limit_to_hard(); while ((opt = getopt(argc, argv, "a")) != -1) { switch (opt) { case 'a': option_append_mode = 1; break; default: break; }; } data_index += optind; length_index += optind; output_index += optind; splitlength = strtoull(argv[length_index], NULL, 10); if (splitlength == 0) { LOGERR("ERROR: Failed to read valid length from argument '%s' base 10 number >=1 expected\n", argv[length_index]); return EXIT_FAILURE; } for (; data_index