split_for_sort: Split a given file into buckets

The target bucket is decided based on the first X characters of a line. The bucket name gets a prefix defined as argument and can be sorted faster on weak hardware. Note: This is just a split alternative. Real world usage in a shell script with a file in which the first 10 characters are the equal in each line, the following 2 bytes are evaluated for splitting: split_for_sort TMPSFS 12 raw_data.txt for f in TMPSFS ; do sort -o "${f}_sorted" -u "${f}" done \# Rely on the argument resolution to go with lexical order cat TMPSFS*_sorted > sorted_data.txt rm TMPSFS*
author: Thorsten Töpper <atsutane@freethoughts.de> 2025-08-10 18:16:07 +0200
committer: Thorsten Töpper <atsutane@freethoughts.de> 2025-08-10 18:16:07 +0200
commit: 9e2f3d59cf249403859916df9756c179753ea7e0 (patch)
tree: 6aaacfd22fc681fb7d95826ef65726c392cfc7d8
parent: 5b743929d23ca0e8004fe2d6bc8ff5c04ed9dbb9 (diff)
download: small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.gz
small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.bz2
5 files changed, 519 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index c82077c..0047e56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-bin/*
+out/*
 man/*.1
 man/*.gz
 .gdbinit
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4d8a55d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+
+
+CFLAGS += -Wall -Wextra -Wformat -Wformat=2 -Wconversion -Wimplicit-fallthrough -Werror=format-security \
+	-Wbidi-chars=any -Werror=implicit -Werror=incompatible-pointer-types -Werror=int-conversion \
+	-fno-delete-null-pointer-checks -fstack-clash-protection -fstack-protector-strong -fstrict-flex-arrays=3
+
+DEBUG_CFLAGS = -DDEBUGBUILD -g -Werror -fsanitize=address
+
+PROD_CFLAGS = -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -FPIE -pie -Wl,-z,noexecstack -Wl,--as-needed -Wl,--no-copy-dt-needed-entries
+
+CFLAGS += -I include
+
+all: out/split_for_sort
+
+
+
+clean: out
+	rm -fv out/debug/*
+	rm -fv out/*
+	rm -rfv out
+
+out:
+	mkdir out
+
+out/debug: out
+	mkdir out/debug
+
+# Lazy, not splitting compiler & linker calls
+out/split_for_sort: out src/split_for_sort.c include/output.h
+	${CC} -o $@ ${CFLAGS} ${PROD_CFLAGS} src/split_for_sort.c
+
+out/debug/split_for_sort_debug: out/debug src/split_for_sort.c include/output.h
+	${CC} -o $@ ${CFLAGS} ${DEBUG_CFLAGS} src/split_for_sort.c
+
+
diff --git a/include/hex_conversion.h b/include/hex_conversion.h
new file mode 100644
index 0000000..b31216d
--- /dev/null
+++ b/include/hex_conversion.h
@@ -0,0 +1,96 @@
+/*
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef HEX_CONVERSION_H
+#define HEX_CONVERSION_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifdef DEBUGBUILD
+#include "output.h"
+#endif
+
+#define ishex_macro(c) ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f'))
+
+int convert_line(char *s);
+int ishex(char c);
+char *convert_to_binary(char *hex, char *out);
+char *convert_from_binary(char *bin, size_t l, char *out);
+
+/* short inline functions are fine in header */
+inline int convert_line(char *s) {
+    size_t i = 0, l = 0;
+    if (s == NULL)
+        return -1;
+    l=strlen(s);
+    for (i=0; i<l; i++) {
+        s[i] = toupper(s[i]);
+        if ((s[i] == '\r' && (l-i<3)) || (s[i] == '\n' && i==l-1)) {
+            s[i] = '\0';
+            break;
+        }
+    }
+    return 0;
+}
+
+inline int ishex(char c) {
+    if ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f')) {
+        return 1;
+    }
+    return 0;
+};
+
+inline char *convert_to_binary(char *hex, char *out) {
+    char tmp[3] = {0,0,0};
+    size_t length, i;
+    if (hex == NULL) return NULL;
+    length=strlen(hex);
+    if ( (length==0) || (length%2 == 1)) return NULL;
+    for (i=0; i<length; i++) {
+        if ( ! ishex_macro(hex[i]) ) {
+#ifdef DEBUGBUILD
+            LOGERR("Incompatible string '%s'\n", hex);
+#endif
+            return NULL;
+        }
+    }
+    if (out == NULL && ((out = calloc((length/2),sizeof(char))) == NULL)) {
+#ifdef DEBUGBUILD
+        LOGERR("ERROR: Failed to allocate %d bytes\n", (length/2));
+#endif
+        return NULL;
+    }
+    for (i=0;i<length;i+=2) {
+        tmp[0] = hex[i];
+        tmp[1] = hex[i+1];
+        out[i/2] = (char) strtol(tmp, NULL, 16);
+    }
+    return out;
+}
+
+/* Use a large buffer and complex method, as with a simple
+ * way there regularly were corrupt results with gcc -O2. */
+inline char *convert_from_binary(char *bin, size_t l, char *out) {
+    char tmp[24];
+    size_t i,pos;
+    if (bin == NULL || l == 0) return NULL;
+    if (out == NULL && (out = calloc(((l*2)+1),sizeof(char))) == NULL) {
+#ifdef DEBUGBUILD
+        LOGERR("ERROR: Failed to allocate %lu bytes\n", ((l*2)+1));
+#endif
+        return NULL;
+    }
+    for (i=0; i<l; i++) {
+        /* Keep in mind this format is not only two characters. */
+        sprintf(tmp, "%02X", (unsigned char)bin[i]);
+        pos=strlen(tmp);
+        out[i*2] = tmp[pos-2];
+        out[(i*2)+1] = tmp[pos-1];
+    }
+    return out;
+}
+
+#endif
+
diff --git a/include/output.h b/include/output.h
new file mode 100644
index 0000000..efc7487
--- /dev/null
+++ b/include/output.h
@@ -0,0 +1,20 @@
+/*
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+#include <stdio.h>
+
+#ifndef LOGERR
+#define LOGERR(...) {fprintf(stderr, "[%s:%d] %s: ", __FILE__, __LINE__, __func__); fprintf(stderr, __VA_ARGS__);}
+#endif
+
+#ifdef DEBUGBUILD
+#define DBGTRC(...) LOGERR(__VA_ARGS__)
+#else
+#define DBGTRC(...)
+#endif
+
+#endif
+
diff --git a/src/split_for_sort.c b/src/split_for_sort.c
new file mode 100644
index 0000000..8f5d8b8
--- /dev/null
+++ b/src/split_for_sort.c
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2025 Thorsten Töpper
+ *
+ * It takes quite some time for sort if it's about text files with multiple
+ * gigabytes content. Split the content into buckets and afterwards sort those.
+ * Once done the content of those sorted temporary files can be merged via cat.
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include "output.h"
+
+/* Also used for paths => at least PATH_MAX */
+#define BUFFERSIZE 4096
+
+/* The arguments will be
+ * PREFIX - file prefix...
+ * SPLIT_LENGTH - the number of characters which will be relevant for file selection
+ * FILES to split
+ */
+struct list_node {
+    char *name;
+    struct list_node *next;
+    FILE *fd;
+};
+
+struct list_head {
+    struct list_node *first;
+    size_t splitlength;
+    size_t length;
+};
+
+int push_into_list_unique(struct list_head *list, char *name);
+int split_into_files(struct list_head *list, char *inputfile, char *prefix);
+size_t list_check_length(struct list_head *list);
+struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname);
+struct list_node *get_node(struct list_head *list, char *name);
+void destroy_list(struct list_head *list);
+void fflush_list(struct list_head *list);
+void print_list(struct list_head *list, FILE *out);
+
+void destroy_list(struct list_head *list) {
+    struct list_node *n = NULL, *b = NULL;
+    if (list == NULL)
+        return;
+    n = list->first;
+    while (n != NULL) {
+        b = n->next;
+        if (n->name != NULL) free(n->name);
+        if (n->fd != NULL) fclose(n->fd);
+        free(n);
+        n = b;
+    }
+    free(list);
+}
+
+void fflush_list(struct list_head *list) {
+    struct list_node *n = NULL;
+    if (list == NULL) return;
+    n = list->first;
+    while (n != NULL) {
+        if (n->fd != NULL) {
+            fflush(n->fd);
+        }
+        n = n->next;
+    }
+}
+
+void print_list(struct list_head *list, FILE *out) {
+    struct list_node *n = NULL;
+    if (list == NULL)
+        return;
+    if (out == NULL) out = stderr;
+    fprintf(out, "length: %lu\n",list_check_length(list));
+    n = list->first;
+    while (n != NULL) {
+        fprintf(out, "node '%s' %s\n", n->name,
+            ((n->fd != NULL) ? "open file descriptor" : ""));
+        n = n->next;
+    }
+}
+
+inline size_t list_check_length(struct list_head *list) {
+    size_t l = 0;
+    struct list_node *ptr = NULL;
+    if (list == NULL) {
+        LOGERR("ERROR: No list given.\n");
+        return 0;
+    }
+    ptr = list->first;
+    while (ptr != NULL) {
+        l++;
+        ptr = ptr->next;
+    }
+    list->length = l;
+    return l;
+}
+
+inline int push_into_list_unique(struct list_head *list, char *name) {
+    size_t name_length = 0; /* required for __builtin___strncpy_chk */
+    struct list_node *ptr = NULL, *tmp = NULL;
+    if (list == NULL || name == NULL || name[0] == '\0') {
+        LOGERR("ERROR: Invalid function arguments.\n");
+        return -1;
+    }
+    ptr = list->first;
+    while (ptr != NULL) {
+        if (strcmp(ptr->name, name) == 0) {
+            return 0;
+        }
+        ptr = ptr->next;
+    }
+
+    if ((tmp = calloc(1, sizeof(struct list_node))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for new node\n");
+        return -2;
+    }
+
+    name_length = strlen(name);
+    if ((tmp->name = calloc(name_length+2, sizeof(char))) == NULL) {
+        LOGERR("ERROR Failed to allocate %lu bytes for data in list\n", strlen(name)+2);
+        free(tmp);
+        return -3;
+    }
+    strncpy(tmp->name, name, name_length+1);
+    tmp->fd = NULL;
+    tmp->next = list->first;
+    list->first = tmp;
+    list->length++;
+    return 0;
+}
+
+inline struct list_node *get_node(struct list_head *list, char *name) {
+    struct list_node *n = NULL;
+    if (list == NULL || name == NULL || name[0] == '\0') {
+        LOGERR("ERROR: Invalid arguments\n");
+        return NULL;
+    }
+
+    n = list->first;
+    while (n != NULL) {
+        if (strcmp(n->name, name) == 0)
+            return n;
+        n = n->next;
+    }
+
+    return NULL;
+}
+
+/* Open the given file and extend / create a list of output filenames,
+ * based on the read content.
+ * splitlength - the length of a line considered regarding the comparision.
+ */
+struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname) {
+    FILE *fd = NULL;
+    char *line = NULL, *previous = NULL;
+    size_t i = 0, len = 0;
+
+    if (fname == NULL || fname[0] == '\0' || splitlength == 0) {
+        LOGERR("ERROR: filename or splitlength invalid\n");
+        return NULL;
+    }
+
+    if ((fd=fopen(fname, "r")) == NULL) {
+        LOGERR("ERROR: Failed to open file '%s': %s\n", fname, strerror(errno));
+        return NULL;
+    }
+
+    if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for read buffer\n");
+        fclose(fd);
+        return NULL;
+    }
+
+    if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for read buffer\n");
+        free(line);
+        fclose(fd);
+        return NULL;
+    }
+
+    if ((list == NULL) && \
+        ((list = calloc(1, sizeof(struct list_head))) == NULL)) {
+            LOGERR("ERROR: Failed to create new list");
+            free(line);
+            fclose(fd);
+            return NULL;
+    }
+
+    list->splitlength = splitlength;
+
+    while (fgets(line, BUFFERSIZE, fd) != NULL) {
+        len = strlen(line);
+        len = (splitlength < len) ? splitlength : len;
+        /* most simple way to stick with FS compatible characters */
+        for (i=0; i<len; i++) {
+            if (! isalnum(line[i]) ) {
+                line[i] = '_';
+            }
+        }
+        line[len] = '\0';
+        if (strncmp(line, previous, splitlength) != 0) {
+            if (push_into_list_unique(list, line) != 0) {
+                LOGERR("WARNING: Failed to insert '%s' into list.\n", line);
+                continue;
+            }
+            strncpy(previous, line, splitlength);
+        }
+    }
+
+    fclose(fd);
+    free(line);
+    free(previous);
+
+    return list;
+}
+
+/* Parse the inputfile and split it's content into the given list of targets
+ * with the prefix for the name of the target.
+ */
+int split_into_files(struct list_head *list, char *inputfile, char *prefix) {
+    FILE *fdin = NULL;
+    char *line = NULL, *filename = NULL, *line_head = NULL, *prev_head = NULL;
+    size_t i = 0, len=0, sl = 0;
+    struct list_node *node = NULL;
+
+
+    if (inputfile == NULL || strlen(inputfile) == 0) {
+        LOGERR("ERROR: no valid filename for input.\n");
+        return -1;
+    }
+    if (list == NULL) {
+        LOGERR("ERROR: No list regarding output files given\n");
+        return -1;
+    }
+
+    if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+        return -2;
+    }
+    if ((line_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+        free(line);
+        return -2;
+    }
+    if ((prev_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+        free(line);
+        free(line_head);
+        return -2;
+    }
+    if ((filename = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+        LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+        free(line);
+        free(line_head);
+        free(prev_head);
+        return -2;
+    }
+
+    if ((fdin = fopen(inputfile, "r")) == NULL) {
+        LOGERR("ERROR: Failed to open file '%s' to read: %s\n",
+                inputfile, strerror(errno));
+        free(filename);
+        free(line);
+        free(line_head);
+        free(prev_head);
+        return -3;
+    }
+
+    /* Optical attraction: work action happens below */
+    sl = list->splitlength;
+    while (fgets(line, BUFFERSIZE, fdin) != NULL) {
+        strncpy(line_head, line, sl+1);
+        len = strnlen(line_head, sl);
+        len = (len<sl) ? len : sl;
+        for (i=0; i<len; i++) {
+            if (! isalnum(line_head[i]) ) {
+                line_head[i] = '_';
+            }
+        }
+        line_head[len] = '\0';
+
+        /* Gain performance if some files are partly sorted */
+        if (strcmp(line_head, prev_head) != 0) {
+            node = get_node(list, line_head);
+            if (node == NULL) {
+                LOGERR("WARNING: no node found for %s - skip line\n", line_head);
+                continue;
+            }
+
+            /* keep the file descriptors open across the input files to be split */
+            if (node->fd == NULL) {
+                snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name);
+                if ((node->fd = fopen(filename, "w")) == NULL) {
+                    LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno));
+                    free(line);
+                    free(line_head);
+                    free(filename);
+                    fclose(fdin);
+                    return -4;
+                }
+            }
+            strncpy(prev_head, line_head, sl);
+        }
+
+        if (fputs(line, node->fd) == EOF) {
+            LOGERR("ERROR: Failed to write into output file for '%s': %s\n",
+                    line_head, strerror(errno));
+            free(line);
+            free(line_head);
+            free(filename);
+            fclose(fdin);
+            return -5;
+        }
+    }
+
+    fflush_list(list);
+    fclose(fdin);
+    free(line);
+    free(line_head);
+    free(filename);
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    int i = 3;
+    size_t splitlength = 0;
+    struct list_head *list = NULL;
+
+    if (argc < 4) {
+        fprintf(stderr, "Usage: %s prefix length files...\n\n", argv[0]);
+        fprintf(stderr, "\tprefix - used with the output filenames\n"
+                        "\tlength - the number of characters relevant for comparing\n");
+        return EXIT_FAILURE;
+    }
+
+    splitlength = strtoull(argv[2], NULL, 10);
+    if (splitlength == 0) {
+        LOGERR("ERROR: Failed to read valid length from argument '%s' base 10 number >=1 expected\n", argv[2]);
+        return EXIT_FAILURE;
+    }
+
+    for (i=3; i<argc; i++) {
+//        fprintf(stderr, "=> Scan '%s' for bucket filenames\n", argv[i]);
+        list = extend_list(list, splitlength, argv[i]);
+    }
+
+    for (i=3; i<argc; i++) {
+//        fprintf(stderr,"=> Split '%s' into bucket files\n", argv[i]);
+        if (split_into_files(list, argv[i], argv[1]) < 0) {
+            destroy_list(list);
+            return EXIT_FAILURE;
+        }
+    }
+
+    destroy_list(list);
+
+    return EXIT_SUCCESS;
+}
+
author	Thorsten Töpper <atsutane@freethoughts.de>	2025-08-10 18:16:07 +0200
committer	Thorsten Töpper <atsutane@freethoughts.de>	2025-08-10 18:16:07 +0200
commit	9e2f3d59cf249403859916df9756c179753ea7e0 (patch)
tree	6aaacfd22fc681fb7d95826ef65726c392cfc7d8
parent	5b743929d23ca0e8004fe2d6bc8ff5c04ed9dbb9 (diff)
download	small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.gz small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.bz2