aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorThorsten Töpper <atsutane@freethoughts.de>2025-08-10 18:16:07 +0200
committerThorsten Töpper <atsutane@freethoughts.de>2025-08-10 18:16:07 +0200
commit9e2f3d59cf249403859916df9756c179753ea7e0 (patch)
tree6aaacfd22fc681fb7d95826ef65726c392cfc7d8 /src
parent5b743929d23ca0e8004fe2d6bc8ff5c04ed9dbb9 (diff)
downloadsmall-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.gz
small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.bz2
split_for_sort: Split a given file into buckets
The target bucket is decided based on the first X characters of a line. The bucket name gets a prefix defined as argument and can be sorted faster on weak hardware. Note: This is just a split alternative. Real world usage in a shell script with a file in which the first 10 characters are the equal in each line, the following 2 bytes are evaluated for splitting: split_for_sort TMPSFS 12 raw_data.txt for f in TMPSFS ; do sort -o "${f}_sorted" -u "${f}" done \# Rely on the argument resolution to go with lexical order cat TMPSFS*_sorted > sorted_data.txt rm TMPSFS*
Diffstat (limited to 'src')
-rw-r--r--src/split_for_sort.c367
1 files changed, 367 insertions, 0 deletions
diff --git a/src/split_for_sort.c b/src/split_for_sort.c
new file mode 100644
index 0000000..8f5d8b8
--- /dev/null
+++ b/src/split_for_sort.c
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2025 Thorsten Töpper
+ *
+ * It takes quite some time for sort if it's about text files with multiple
+ * gigabytes content. Split the content into buckets and afterwards sort those.
+ * Once done the content of those sorted temporary files can be merged via cat.
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include "output.h"
+
+/* Also used for paths => at least PATH_MAX */
+#define BUFFERSIZE 4096
+
+/* The arguments will be
+ * PREFIX - file prefix...
+ * SPLIT_LENGTH - the number of characters which will be relevant for file selection
+ * FILES to split
+ */
+struct list_node {
+ char *name;
+ struct list_node *next;
+ FILE *fd;
+};
+
+struct list_head {
+ struct list_node *first;
+ size_t splitlength;
+ size_t length;
+};
+
+int push_into_list_unique(struct list_head *list, char *name);
+int split_into_files(struct list_head *list, char *inputfile, char *prefix);
+size_t list_check_length(struct list_head *list);
+struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname);
+struct list_node *get_node(struct list_head *list, char *name);
+void destroy_list(struct list_head *list);
+void fflush_list(struct list_head *list);
+void print_list(struct list_head *list, FILE *out);
+
+void destroy_list(struct list_head *list) {
+ struct list_node *n = NULL, *b = NULL;
+ if (list == NULL)
+ return;
+ n = list->first;
+ while (n != NULL) {
+ b = n->next;
+ if (n->name != NULL) free(n->name);
+ if (n->fd != NULL) fclose(n->fd);
+ free(n);
+ n = b;
+ }
+ free(list);
+}
+
+void fflush_list(struct list_head *list) {
+ struct list_node *n = NULL;
+ if (list == NULL) return;
+ n = list->first;
+ while (n != NULL) {
+ if (n->fd != NULL) {
+ fflush(n->fd);
+ }
+ n = n->next;
+ }
+}
+
+void print_list(struct list_head *list, FILE *out) {
+ struct list_node *n = NULL;
+ if (list == NULL)
+ return;
+ if (out == NULL) out = stderr;
+ fprintf(out, "length: %lu\n",list_check_length(list));
+ n = list->first;
+ while (n != NULL) {
+ fprintf(out, "node '%s' %s\n", n->name,
+ ((n->fd != NULL) ? "open file descriptor" : ""));
+ n = n->next;
+ }
+}
+
+inline size_t list_check_length(struct list_head *list) {
+ size_t l = 0;
+ struct list_node *ptr = NULL;
+ if (list == NULL) {
+ LOGERR("ERROR: No list given.\n");
+ return 0;
+ }
+ ptr = list->first;
+ while (ptr != NULL) {
+ l++;
+ ptr = ptr->next;
+ }
+ list->length = l;
+ return l;
+}
+
+inline int push_into_list_unique(struct list_head *list, char *name) {
+ size_t name_length = 0; /* required for __builtin___strncpy_chk */
+ struct list_node *ptr = NULL, *tmp = NULL;
+ if (list == NULL || name == NULL || name[0] == '\0') {
+ LOGERR("ERROR: Invalid function arguments.\n");
+ return -1;
+ }
+ ptr = list->first;
+ while (ptr != NULL) {
+ if (strcmp(ptr->name, name) == 0) {
+ return 0;
+ }
+ ptr = ptr->next;
+ }
+
+ if ((tmp = calloc(1, sizeof(struct list_node))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for new node\n");
+ return -2;
+ }
+
+ name_length = strlen(name);
+ if ((tmp->name = calloc(name_length+2, sizeof(char))) == NULL) {
+ LOGERR("ERROR Failed to allocate %lu bytes for data in list\n", strlen(name)+2);
+ free(tmp);
+ return -3;
+ }
+ strncpy(tmp->name, name, name_length+1);
+ tmp->fd = NULL;
+ tmp->next = list->first;
+ list->first = tmp;
+ list->length++;
+ return 0;
+}
+
+inline struct list_node *get_node(struct list_head *list, char *name) {
+ struct list_node *n = NULL;
+ if (list == NULL || name == NULL || name[0] == '\0') {
+ LOGERR("ERROR: Invalid arguments\n");
+ return NULL;
+ }
+
+ n = list->first;
+ while (n != NULL) {
+ if (strcmp(n->name, name) == 0)
+ return n;
+ n = n->next;
+ }
+
+ return NULL;
+}
+
+/* Open the given file and extend / create a list of output filenames,
+ * based on the read content.
+ * splitlength - the length of a line considered regarding the comparision.
+ */
+struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname) {
+ FILE *fd = NULL;
+ char *line = NULL, *previous = NULL;
+ size_t i = 0, len = 0;
+
+ if (fname == NULL || fname[0] == '\0' || splitlength == 0) {
+ LOGERR("ERROR: filename or splitlength invalid\n");
+ return NULL;
+ }
+
+ if ((fd=fopen(fname, "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s\n", fname, strerror(errno));
+ return NULL;
+ }
+
+ if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for read buffer\n");
+ fclose(fd);
+ return NULL;
+ }
+
+ if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for read buffer\n");
+ free(line);
+ fclose(fd);
+ return NULL;
+ }
+
+ if ((list == NULL) && \
+ ((list = calloc(1, sizeof(struct list_head))) == NULL)) {
+ LOGERR("ERROR: Failed to create new list");
+ free(line);
+ fclose(fd);
+ return NULL;
+ }
+
+ list->splitlength = splitlength;
+
+ while (fgets(line, BUFFERSIZE, fd) != NULL) {
+ len = strlen(line);
+ len = (splitlength < len) ? splitlength : len;
+ /* most simple way to stick with FS compatible characters */
+ for (i=0; i<len; i++) {
+ if (! isalnum(line[i]) ) {
+ line[i] = '_';
+ }
+ }
+ line[len] = '\0';
+ if (strncmp(line, previous, splitlength) != 0) {
+ if (push_into_list_unique(list, line) != 0) {
+ LOGERR("WARNING: Failed to insert '%s' into list.\n", line);
+ continue;
+ }
+ strncpy(previous, line, splitlength);
+ }
+ }
+
+ fclose(fd);
+ free(line);
+ free(previous);
+
+ return list;
+}
+
+/* Parse the inputfile and split it's content into the given list of targets
+ * with the prefix for the name of the target.
+ */
+int split_into_files(struct list_head *list, char *inputfile, char *prefix) {
+ FILE *fdin = NULL;
+ char *line = NULL, *filename = NULL, *line_head = NULL, *prev_head = NULL;
+ size_t i = 0, len=0, sl = 0;
+ struct list_node *node = NULL;
+
+
+ if (inputfile == NULL || strlen(inputfile) == 0) {
+ LOGERR("ERROR: no valid filename for input.\n");
+ return -1;
+ }
+ if (list == NULL) {
+ LOGERR("ERROR: No list regarding output files given\n");
+ return -1;
+ }
+
+ if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ return -2;
+ }
+ if ((line_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ free(line);
+ return -2;
+ }
+ if ((prev_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ free(line);
+ free(line_head);
+ return -2;
+ }
+ if ((filename = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ free(line);
+ free(line_head);
+ free(prev_head);
+ return -2;
+ }
+
+ if ((fdin = fopen(inputfile, "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s' to read: %s\n",
+ inputfile, strerror(errno));
+ free(filename);
+ free(line);
+ free(line_head);
+ free(prev_head);
+ return -3;
+ }
+
+ /* Optical attraction: work action happens below */
+ sl = list->splitlength;
+ while (fgets(line, BUFFERSIZE, fdin) != NULL) {
+ strncpy(line_head, line, sl+1);
+ len = strnlen(line_head, sl);
+ len = (len<sl) ? len : sl;
+ for (i=0; i<len; i++) {
+ if (! isalnum(line_head[i]) ) {
+ line_head[i] = '_';
+ }
+ }
+ line_head[len] = '\0';
+
+ /* Gain performance if some files are partly sorted */
+ if (strcmp(line_head, prev_head) != 0) {
+ node = get_node(list, line_head);
+ if (node == NULL) {
+ LOGERR("WARNING: no node found for %s - skip line\n", line_head);
+ continue;
+ }
+
+ /* keep the file descriptors open across the input files to be split */
+ if (node->fd == NULL) {
+ snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name);
+ if ((node->fd = fopen(filename, "w")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno));
+ free(line);
+ free(line_head);
+ free(filename);
+ fclose(fdin);
+ return -4;
+ }
+ }
+ strncpy(prev_head, line_head, sl);
+ }
+
+ if (fputs(line, node->fd) == EOF) {
+ LOGERR("ERROR: Failed to write into output file for '%s': %s\n",
+ line_head, strerror(errno));
+ free(line);
+ free(line_head);
+ free(filename);
+ fclose(fdin);
+ return -5;
+ }
+ }
+
+ fflush_list(list);
+ fclose(fdin);
+ free(line);
+ free(line_head);
+ free(filename);
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int i = 3;
+ size_t splitlength = 0;
+ struct list_head *list = NULL;
+
+ if (argc < 4) {
+ fprintf(stderr, "Usage: %s prefix length files...\n\n", argv[0]);
+ fprintf(stderr, "\tprefix - used with the output filenames\n"
+ "\tlength - the number of characters relevant for comparing\n");
+ return EXIT_FAILURE;
+ }
+
+ splitlength = strtoull(argv[2], NULL, 10);
+ if (splitlength == 0) {
+ LOGERR("ERROR: Failed to read valid length from argument '%s' base 10 number >=1 expected\n", argv[2]);
+ return EXIT_FAILURE;
+ }
+
+ for (i=3; i<argc; i++) {
+// fprintf(stderr, "=> Scan '%s' for bucket filenames\n", argv[i]);
+ list = extend_list(list, splitlength, argv[i]);
+ }
+
+ for (i=3; i<argc; i++) {
+// fprintf(stderr,"=> Split '%s' into bucket files\n", argv[i]);
+ if (split_into_files(list, argv[i], argv[1]) < 0) {
+ destroy_list(list);
+ return EXIT_FAILURE;
+ }
+ }
+
+ destroy_list(list);
+
+ return EXIT_SUCCESS;
+}
+