aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThorsten Töpper <atsutane@freethoughts.de>2025-08-10 18:16:07 +0200
committerThorsten Töpper <atsutane@freethoughts.de>2025-08-10 18:16:07 +0200
commit9e2f3d59cf249403859916df9756c179753ea7e0 (patch)
tree6aaacfd22fc681fb7d95826ef65726c392cfc7d8
parent5b743929d23ca0e8004fe2d6bc8ff5c04ed9dbb9 (diff)
downloadsmall-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.gz
small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.bz2
split_for_sort: Split a given file into buckets
The target bucket is decided based on the first X characters of a line. The bucket name gets a prefix defined as argument and can be sorted faster on weak hardware. Note: This is just a split alternative. Real world usage in a shell script with a file in which the first 10 characters are the equal in each line, the following 2 bytes are evaluated for splitting: split_for_sort TMPSFS 12 raw_data.txt for f in TMPSFS ; do sort -o "${f}_sorted" -u "${f}" done \# Rely on the argument resolution to go with lexical order cat TMPSFS*_sorted > sorted_data.txt rm TMPSFS*
-rw-r--r--.gitignore2
-rw-r--r--Makefile35
-rw-r--r--include/hex_conversion.h96
-rw-r--r--include/output.h20
-rw-r--r--src/split_for_sort.c367
5 files changed, 519 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index c82077c..0047e56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-bin/*
+out/*
man/*.1
man/*.gz
.gdbinit
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4d8a55d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+
+
+CFLAGS += -Wall -Wextra -Wformat -Wformat=2 -Wconversion -Wimplicit-fallthrough -Werror=format-security \
+ -Wbidi-chars=any -Werror=implicit -Werror=incompatible-pointer-types -Werror=int-conversion \
+ -fno-delete-null-pointer-checks -fstack-clash-protection -fstack-protector-strong -fstrict-flex-arrays=3
+
+DEBUG_CFLAGS = -DDEBUGBUILD -g -Werror -fsanitize=address
+
+PROD_CFLAGS = -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -FPIE -pie -Wl,-z,noexecstack -Wl,--as-needed -Wl,--no-copy-dt-needed-entries
+
+CFLAGS += -I include
+
+all: out/split_for_sort
+
+
+
+clean: out
+ rm -fv out/debug/*
+ rm -fv out/*
+ rm -rfv out
+
+out:
+ mkdir out
+
+out/debug: out
+ mkdir out/debug
+
+# Lazy, not splitting compiler & linker calls
+out/split_for_sort: out src/split_for_sort.c include/output.h
+ ${CC} -o $@ ${CFLAGS} ${PROD_CFLAGS} src/split_for_sort.c
+
+out/debug/split_for_sort_debug: out/debug src/split_for_sort.c include/output.h
+ ${CC} -o $@ ${CFLAGS} ${DEBUG_CFLAGS} src/split_for_sort.c
+
+
diff --git a/include/hex_conversion.h b/include/hex_conversion.h
new file mode 100644
index 0000000..b31216d
--- /dev/null
+++ b/include/hex_conversion.h
@@ -0,0 +1,96 @@
+/*
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef HEX_CONVERSION_H
+#define HEX_CONVERSION_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifdef DEBUGBUILD
+#include "output.h"
+#endif
+
+#define ishex_macro(c) ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f'))
+
+int convert_line(char *s);
+int ishex(char c);
+char *convert_to_binary(char *hex, char *out);
+char *convert_from_binary(char *bin, size_t l, char *out);
+
+/* short inline functions are fine in header */
+inline int convert_line(char *s) {
+ size_t i = 0, l = 0;
+ if (s == NULL)
+ return -1;
+ l=strlen(s);
+ for (i=0; i<l; i++) {
+ s[i] = toupper(s[i]);
+ if ((s[i] == '\r' && (l-i<3)) || (s[i] == '\n' && i==l-1)) {
+ s[i] = '\0';
+ break;
+ }
+ }
+ return 0;
+}
+
+inline int ishex(char c) {
+ if ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f')) {
+ return 1;
+ }
+ return 0;
+};
+
+inline char *convert_to_binary(char *hex, char *out) {
+ char tmp[3] = {0,0,0};
+ size_t length, i;
+ if (hex == NULL) return NULL;
+ length=strlen(hex);
+ if ( (length==0) || (length%2 == 1)) return NULL;
+ for (i=0; i<length; i++) {
+ if ( ! ishex_macro(hex[i]) ) {
+#ifdef DEBUGBUILD
+ LOGERR("Incompatible string '%s'\n", hex);
+#endif
+ return NULL;
+ }
+ }
+ if (out == NULL && ((out = calloc((length/2),sizeof(char))) == NULL)) {
+#ifdef DEBUGBUILD
+ LOGERR("ERROR: Failed to allocate %d bytes\n", (length/2));
+#endif
+ return NULL;
+ }
+ for (i=0;i<length;i+=2) {
+ tmp[0] = hex[i];
+ tmp[1] = hex[i+1];
+ out[i/2] = (char) strtol(tmp, NULL, 16);
+ }
+ return out;
+}
+
+/* Use a large buffer and complex method, as with a simple
+ * way there regularly were corrupt results with gcc -O2. */
+inline char *convert_from_binary(char *bin, size_t l, char *out) {
+ char tmp[24];
+ size_t i,pos;
+ if (bin == NULL || l == 0) return NULL;
+ if (out == NULL && (out = calloc(((l*2)+1),sizeof(char))) == NULL) {
+#ifdef DEBUGBUILD
+ LOGERR("ERROR: Failed to allocate %lu bytes\n", ((l*2)+1));
+#endif
+ return NULL;
+ }
+ for (i=0; i<l; i++) {
+ /* Keep in mind this format is not only two characters. */
+ sprintf(tmp, "%02X", (unsigned char)bin[i]);
+ pos=strlen(tmp);
+ out[i*2] = tmp[pos-2];
+ out[(i*2)+1] = tmp[pos-1];
+ }
+ return out;
+}
+
+#endif
+
diff --git a/include/output.h b/include/output.h
new file mode 100644
index 0000000..efc7487
--- /dev/null
+++ b/include/output.h
@@ -0,0 +1,20 @@
+/*
+ * vim:ts=4:sw=4:expandtab
+ */
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+#include <stdio.h>
+
+#ifndef LOGERR
+#define LOGERR(...) {fprintf(stderr, "[%s:%d] %s: ", __FILE__, __LINE__, __func__); fprintf(stderr, __VA_ARGS__);}
+#endif
+
+#ifdef DEBUGBUILD
+#define DBGTRC(...) LOGERR(__VA_ARGS__)
+#else
+#define DBGTRC(...)
+#endif
+
+#endif
+
diff --git a/src/split_for_sort.c b/src/split_for_sort.c
new file mode 100644
index 0000000..8f5d8b8
--- /dev/null
+++ b/src/split_for_sort.c
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2025 Thorsten Töpper
+ *
+ * It takes quite some time for sort if it's about text files with multiple
+ * gigabytes content. Split the content into buckets and afterwards sort those.
+ * Once done the content of those sorted temporary files can be merged via cat.
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include "output.h"
+
+/* Also used for paths => at least PATH_MAX */
+#define BUFFERSIZE 4096
+
+/* The arguments will be
+ * PREFIX - file prefix...
+ * SPLIT_LENGTH - the number of characters which will be relevant for file selection
+ * FILES to split
+ */
+struct list_node {
+ char *name;
+ struct list_node *next;
+ FILE *fd;
+};
+
+struct list_head {
+ struct list_node *first;
+ size_t splitlength;
+ size_t length;
+};
+
+int push_into_list_unique(struct list_head *list, char *name);
+int split_into_files(struct list_head *list, char *inputfile, char *prefix);
+size_t list_check_length(struct list_head *list);
+struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname);
+struct list_node *get_node(struct list_head *list, char *name);
+void destroy_list(struct list_head *list);
+void fflush_list(struct list_head *list);
+void print_list(struct list_head *list, FILE *out);
+
+void destroy_list(struct list_head *list) {
+ struct list_node *n = NULL, *b = NULL;
+ if (list == NULL)
+ return;
+ n = list->first;
+ while (n != NULL) {
+ b = n->next;
+ if (n->name != NULL) free(n->name);
+ if (n->fd != NULL) fclose(n->fd);
+ free(n);
+ n = b;
+ }
+ free(list);
+}
+
+void fflush_list(struct list_head *list) {
+ struct list_node *n = NULL;
+ if (list == NULL) return;
+ n = list->first;
+ while (n != NULL) {
+ if (n->fd != NULL) {
+ fflush(n->fd);
+ }
+ n = n->next;
+ }
+}
+
+void print_list(struct list_head *list, FILE *out) {
+ struct list_node *n = NULL;
+ if (list == NULL)
+ return;
+ if (out == NULL) out = stderr;
+ fprintf(out, "length: %lu\n",list_check_length(list));
+ n = list->first;
+ while (n != NULL) {
+ fprintf(out, "node '%s' %s\n", n->name,
+ ((n->fd != NULL) ? "open file descriptor" : ""));
+ n = n->next;
+ }
+}
+
+inline size_t list_check_length(struct list_head *list) {
+ size_t l = 0;
+ struct list_node *ptr = NULL;
+ if (list == NULL) {
+ LOGERR("ERROR: No list given.\n");
+ return 0;
+ }
+ ptr = list->first;
+ while (ptr != NULL) {
+ l++;
+ ptr = ptr->next;
+ }
+ list->length = l;
+ return l;
+}
+
+inline int push_into_list_unique(struct list_head *list, char *name) {
+ size_t name_length = 0; /* required for __builtin___strncpy_chk */
+ struct list_node *ptr = NULL, *tmp = NULL;
+ if (list == NULL || name == NULL || name[0] == '\0') {
+ LOGERR("ERROR: Invalid function arguments.\n");
+ return -1;
+ }
+ ptr = list->first;
+ while (ptr != NULL) {
+ if (strcmp(ptr->name, name) == 0) {
+ return 0;
+ }
+ ptr = ptr->next;
+ }
+
+ if ((tmp = calloc(1, sizeof(struct list_node))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for new node\n");
+ return -2;
+ }
+
+ name_length = strlen(name);
+ if ((tmp->name = calloc(name_length+2, sizeof(char))) == NULL) {
+ LOGERR("ERROR Failed to allocate %lu bytes for data in list\n", strlen(name)+2);
+ free(tmp);
+ return -3;
+ }
+ strncpy(tmp->name, name, name_length+1);
+ tmp->fd = NULL;
+ tmp->next = list->first;
+ list->first = tmp;
+ list->length++;
+ return 0;
+}
+
+inline struct list_node *get_node(struct list_head *list, char *name) {
+ struct list_node *n = NULL;
+ if (list == NULL || name == NULL || name[0] == '\0') {
+ LOGERR("ERROR: Invalid arguments\n");
+ return NULL;
+ }
+
+ n = list->first;
+ while (n != NULL) {
+ if (strcmp(n->name, name) == 0)
+ return n;
+ n = n->next;
+ }
+
+ return NULL;
+}
+
+/* Open the given file and extend / create a list of output filenames,
+ * based on the read content.
+ * splitlength - the length of a line considered regarding the comparision.
+ */
+struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname) {
+ FILE *fd = NULL;
+ char *line = NULL, *previous = NULL;
+ size_t i = 0, len = 0;
+
+ if (fname == NULL || fname[0] == '\0' || splitlength == 0) {
+ LOGERR("ERROR: filename or splitlength invalid\n");
+ return NULL;
+ }
+
+ if ((fd=fopen(fname, "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s\n", fname, strerror(errno));
+ return NULL;
+ }
+
+ if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for read buffer\n");
+ fclose(fd);
+ return NULL;
+ }
+
+ if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for read buffer\n");
+ free(line);
+ fclose(fd);
+ return NULL;
+ }
+
+ if ((list == NULL) && \
+ ((list = calloc(1, sizeof(struct list_head))) == NULL)) {
+ LOGERR("ERROR: Failed to create new list");
+ free(line);
+ fclose(fd);
+ return NULL;
+ }
+
+ list->splitlength = splitlength;
+
+ while (fgets(line, BUFFERSIZE, fd) != NULL) {
+ len = strlen(line);
+ len = (splitlength < len) ? splitlength : len;
+ /* most simple way to stick with FS compatible characters */
+ for (i=0; i<len; i++) {
+ if (! isalnum(line[i]) ) {
+ line[i] = '_';
+ }
+ }
+ line[len] = '\0';
+ if (strncmp(line, previous, splitlength) != 0) {
+ if (push_into_list_unique(list, line) != 0) {
+ LOGERR("WARNING: Failed to insert '%s' into list.\n", line);
+ continue;
+ }
+ strncpy(previous, line, splitlength);
+ }
+ }
+
+ fclose(fd);
+ free(line);
+ free(previous);
+
+ return list;
+}
+
+/* Parse the inputfile and split it's content into the given list of targets
+ * with the prefix for the name of the target.
+ */
+int split_into_files(struct list_head *list, char *inputfile, char *prefix) {
+ FILE *fdin = NULL;
+ char *line = NULL, *filename = NULL, *line_head = NULL, *prev_head = NULL;
+ size_t i = 0, len=0, sl = 0;
+ struct list_node *node = NULL;
+
+
+ if (inputfile == NULL || strlen(inputfile) == 0) {
+ LOGERR("ERROR: no valid filename for input.\n");
+ return -1;
+ }
+ if (list == NULL) {
+ LOGERR("ERROR: No list regarding output files given\n");
+ return -1;
+ }
+
+ if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ return -2;
+ }
+ if ((line_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ free(line);
+ return -2;
+ }
+ if ((prev_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ free(line);
+ free(line_head);
+ return -2;
+ }
+ if ((filename = calloc(BUFFERSIZE, sizeof(char))) == NULL) {
+ LOGERR("ERROR: Failed to allocate memory for input buffer.\n");
+ free(line);
+ free(line_head);
+ free(prev_head);
+ return -2;
+ }
+
+ if ((fdin = fopen(inputfile, "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s' to read: %s\n",
+ inputfile, strerror(errno));
+ free(filename);
+ free(line);
+ free(line_head);
+ free(prev_head);
+ return -3;
+ }
+
+ /* Optical attraction: work action happens below */
+ sl = list->splitlength;
+ while (fgets(line, BUFFERSIZE, fdin) != NULL) {
+ strncpy(line_head, line, sl+1);
+ len = strnlen(line_head, sl);
+ len = (len<sl) ? len : sl;
+ for (i=0; i<len; i++) {
+ if (! isalnum(line_head[i]) ) {
+ line_head[i] = '_';
+ }
+ }
+ line_head[len] = '\0';
+
+ /* Gain performance if some files are partly sorted */
+ if (strcmp(line_head, prev_head) != 0) {
+ node = get_node(list, line_head);
+ if (node == NULL) {
+ LOGERR("WARNING: no node found for %s - skip line\n", line_head);
+ continue;
+ }
+
+ /* keep the file descriptors open across the input files to be split */
+ if (node->fd == NULL) {
+ snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name);
+ if ((node->fd = fopen(filename, "w")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno));
+ free(line);
+ free(line_head);
+ free(filename);
+ fclose(fdin);
+ return -4;
+ }
+ }
+ strncpy(prev_head, line_head, sl);
+ }
+
+ if (fputs(line, node->fd) == EOF) {
+ LOGERR("ERROR: Failed to write into output file for '%s': %s\n",
+ line_head, strerror(errno));
+ free(line);
+ free(line_head);
+ free(filename);
+ fclose(fdin);
+ return -5;
+ }
+ }
+
+ fflush_list(list);
+ fclose(fdin);
+ free(line);
+ free(line_head);
+ free(filename);
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int i = 3;
+ size_t splitlength = 0;
+ struct list_head *list = NULL;
+
+ if (argc < 4) {
+ fprintf(stderr, "Usage: %s prefix length files...\n\n", argv[0]);
+ fprintf(stderr, "\tprefix - used with the output filenames\n"
+ "\tlength - the number of characters relevant for comparing\n");
+ return EXIT_FAILURE;
+ }
+
+ splitlength = strtoull(argv[2], NULL, 10);
+ if (splitlength == 0) {
+ LOGERR("ERROR: Failed to read valid length from argument '%s' base 10 number >=1 expected\n", argv[2]);
+ return EXIT_FAILURE;
+ }
+
+ for (i=3; i<argc; i++) {
+// fprintf(stderr, "=> Scan '%s' for bucket filenames\n", argv[i]);
+ list = extend_list(list, splitlength, argv[i]);
+ }
+
+ for (i=3; i<argc; i++) {
+// fprintf(stderr,"=> Split '%s' into bucket files\n", argv[i]);
+ if (split_into_files(list, argv[i], argv[1]) < 0) {
+ destroy_list(list);
+ return EXIT_FAILURE;
+ }
+ }
+
+ destroy_list(list);
+
+ return EXIT_SUCCESS;
+}
+