/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * It takes quite some time for sort if it's about text files with multiple * gigabytes content. Split the content into buckets and afterwards sort those. * Once done the content of those sorted temporary files can be merged via cat. * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include "output.h" /* Also used for paths => at least PATH_MAX */ #define BUFFERSIZE 4096 /* The arguments will be * PREFIX - file prefix... * SPLIT_LENGTH - the number of characters which will be relevant for file selection * FILES to split */ struct list_node { char *name; struct list_node *next; FILE *fd; }; struct list_head { struct list_node *first; size_t splitlength; size_t length; }; int push_into_list_unique(struct list_head *list, char *name); int split_into_files(struct list_head *list, char *inputfile, char *prefix); size_t list_check_length(struct list_head *list); struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname); struct list_node *get_node(struct list_head *list, char *name); void destroy_list(struct list_head *list); void fflush_list(struct list_head *list); void print_list(struct list_head *list, FILE *out); int set_nofile_limit_to_hard(); int set_nofile_limit_to_hard() { struct rlimit lim; if (getrlimit(RLIMIT_NOFILE , &lim) != 0) { LOGERR("ERROR: Failed to get RLIMIT_NOFILE: %s (errno %d)\n", strerror(errno), errno); return -1; } lim.rlim_cur = lim.rlim_max; if (setrlimit(RLIMIT_NOFILE , &lim) != 0) { LOGERR("ERROR: Failed to set RLIMIT_NOFILE: %s (errno %d)\n", strerror(errno), errno); return -1; } return 0; } void destroy_list(struct list_head *list) { struct list_node *n = NULL, *b = NULL; if (list == NULL) return; n = list->first; while (n != NULL) { b = n->next; if (n->name != NULL) free(n->name); if (n->fd != NULL) fclose(n->fd); free(n); n = b; } free(list); } void fflush_list(struct list_head *list) { struct list_node *n = NULL; if (list == NULL) return; n = list->first; while (n != NULL) { if (n->fd != NULL) { fflush(n->fd); } n = n->next; } } void print_list(struct list_head *list, FILE *out) { struct list_node *n = NULL; if (list == NULL) return; if (out == NULL) out = stderr; fprintf(out, "length: %lu\n",list_check_length(list)); n = list->first; while (n != NULL) { fprintf(out, "node '%s' %s\n", n->name, ((n->fd != NULL) ? "open file descriptor" : "")); n = n->next; } } inline size_t list_check_length(struct list_head *list) { size_t l = 0; struct list_node *ptr = NULL; if (list == NULL) { LOGERR("ERROR: No list given.\n"); return 0; } ptr = list->first; while (ptr != NULL) { l++; ptr = ptr->next; } list->length = l; return l; } inline int push_into_list_unique(struct list_head *list, char *name) { size_t name_length = 0; struct list_node *ptr = NULL, *tmp = NULL; if (list == NULL || name == NULL || name[0] == '\0') { LOGERR("ERROR: Invalid function arguments.\n"); return -1; } ptr = list->first; while (ptr != NULL) { if (strcmp(ptr->name, name) == 0) { return 0; } ptr = ptr->next; } if ((tmp = calloc(1, sizeof(struct list_node))) == NULL) { LOGERR("ERROR: Failed to allocate memory for new node\n"); return -2; } name_length = strlen(name); if ((tmp->name = calloc(name_length+1, sizeof(char))) == NULL) { LOGERR("ERROR Failed to allocate %lu bytes for data in list\n", strlen(name)+1); free(tmp); return -3; } memcpy(tmp->name, name, name_length); tmp->fd = NULL; tmp->next = list->first; list->first = tmp; list->length++; return 0; } inline struct list_node *get_node(struct list_head *list, char *name) { struct list_node *n = NULL; if (list == NULL || name == NULL || name[0] == '\0') { LOGERR("ERROR: Invalid arguments\n"); return NULL; } n = list->first; while (n != NULL) { if (strcmp(n->name, name) == 0) return n; n = n->next; } return NULL; } /* Open the given file and extend / create a list of output filenames, * based on the read content. * splitlength - the length of a line considered regarding the comparision. */ struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname) { FILE *fd = NULL; char *line = NULL, *previous = NULL; size_t i = 0, len = 0; if (fname == NULL || fname[0] == '\0' || splitlength == 0) { LOGERR("ERROR: filename or splitlength invalid\n"); return NULL; } if ((fd=fopen(fname, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s\n", fname, strerror(errno)); return NULL; } if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); fclose(fd); return NULL; } if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate memory for read buffer\n"); free(line); fclose(fd); return NULL; } if ((list == NULL) && \ ((list = calloc(1, sizeof(struct list_head))) == NULL)) { LOGERR("ERROR: Failed to create new list"); free(line); fclose(fd); return NULL; } list->splitlength = splitlength; while (fgets(line, BUFFERSIZE, fd) != NULL) { len = strlen(line); len = (splitlength < len) ? splitlength : len; /* most simple way to stick with FS compatible characters */ for (i=0; isplitlength; while (fgets(line, BUFFERSIZE, fdin) != NULL) { strncpy(line_head, line, sl+1); len = strnlen(line_head, sl); len = (lenfd == NULL) { snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name); if ((node->fd = fopen(filename, "w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); free(line); free(line_head); free(filename); fclose(fdin); return -4; } } strncpy(prev_head, line_head, sl); } if (fputs(line, node->fd) == EOF) { LOGERR("ERROR: Failed to write into output file for '%s': %s\n", line_head, strerror(errno)); free(line); free(line_head); free(filename); fclose(fdin); return -5; } } fflush_list(list); fclose(fdin); free(line); free(line_head); free(filename); return 0; } int main(int argc, char **argv) { int i = 3; size_t splitlength = 0; struct list_head *list = NULL; if (argc < 4) { fprintf(stderr, "Usage: %s prefix length files...\n\n", argv[0]); fprintf(stderr, "\tprefix - used with the output filenames\n" "\tlength - the number of characters relevant for comparing\n"); return EXIT_FAILURE; } /* modern Linux distributions set the soft limit to 1024 file descriptors, * this may not be sufficient, therefore increase to the hard limit */ set_nofile_limit_to_hard(); splitlength = strtoull(argv[2], NULL, 10); if (splitlength == 0) { LOGERR("ERROR: Failed to read valid length from argument '%s' base 10 number >=1 expected\n", argv[2]); return EXIT_FAILURE; } for (i=3; i Scan '%s' for bucket filenames\n", argv[i]); list = extend_list(list, splitlength, argv[i]); // fprintf(stderr,"=> Split '%s' into bucket files\n", argv[i]); if (split_into_files(list, argv[i], argv[1]) < 0) { destroy_list(list); return EXIT_FAILURE; } } destroy_list(list); return EXIT_SUCCESS; }