diff options
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Makefile | 35 | ||||
| -rw-r--r-- | include/hex_conversion.h | 96 | ||||
| -rw-r--r-- | include/output.h | 20 | ||||
| -rw-r--r-- | src/split_for_sort.c | 367 |
5 files changed, 519 insertions, 1 deletions
@@ -1,4 +1,4 @@ -bin/* +out/* man/*.1 man/*.gz .gdbinit diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4d8a55d --- /dev/null +++ b/Makefile @@ -0,0 +1,35 @@ + + +CFLAGS += -Wall -Wextra -Wformat -Wformat=2 -Wconversion -Wimplicit-fallthrough -Werror=format-security \ + -Wbidi-chars=any -Werror=implicit -Werror=incompatible-pointer-types -Werror=int-conversion \ + -fno-delete-null-pointer-checks -fstack-clash-protection -fstack-protector-strong -fstrict-flex-arrays=3 + +DEBUG_CFLAGS = -DDEBUGBUILD -g -Werror -fsanitize=address + +PROD_CFLAGS = -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -FPIE -pie -Wl,-z,noexecstack -Wl,--as-needed -Wl,--no-copy-dt-needed-entries + +CFLAGS += -I include + +all: out/split_for_sort + + + +clean: out + rm -fv out/debug/* + rm -fv out/* + rm -rfv out + +out: + mkdir out + +out/debug: out + mkdir out/debug + +# Lazy, not splitting compiler & linker calls +out/split_for_sort: out src/split_for_sort.c include/output.h + ${CC} -o $@ ${CFLAGS} ${PROD_CFLAGS} src/split_for_sort.c + +out/debug/split_for_sort_debug: out/debug src/split_for_sort.c include/output.h + ${CC} -o $@ ${CFLAGS} ${DEBUG_CFLAGS} src/split_for_sort.c + + diff --git a/include/hex_conversion.h b/include/hex_conversion.h new file mode 100644 index 0000000..b31216d --- /dev/null +++ b/include/hex_conversion.h @@ -0,0 +1,96 @@ +/* + * vim:ts=4:sw=4:expandtab + */ +#ifndef HEX_CONVERSION_H +#define HEX_CONVERSION_H + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#ifdef DEBUGBUILD +#include "output.h" +#endif + +#define ishex_macro(c) ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f')) + +int convert_line(char *s); +int ishex(char c); +char *convert_to_binary(char *hex, char *out); +char *convert_from_binary(char *bin, size_t l, char *out); + +/* short inline functions are fine in header */ +inline int convert_line(char *s) { + size_t i = 0, l = 0; + if (s == NULL) + return -1; + l=strlen(s); + for (i=0; i<l; i++) { + s[i] = toupper(s[i]); + if ((s[i] == '\r' && (l-i<3)) || (s[i] == '\n' && i==l-1)) { + s[i] = '\0'; + break; + } + } + return 0; +} + +inline int ishex(char c) { + if ((c>='0' && c <= '9') || (c>='A' && c <= 'F') || (c>='a' && c <= 'f')) { + return 1; + } + return 0; +}; + +inline char *convert_to_binary(char *hex, char *out) { + char tmp[3] = {0,0,0}; + size_t length, i; + if (hex == NULL) return NULL; + length=strlen(hex); + if ( (length==0) || (length%2 == 1)) return NULL; + for (i=0; i<length; i++) { + if ( ! ishex_macro(hex[i]) ) { +#ifdef DEBUGBUILD + LOGERR("Incompatible string '%s'\n", hex); +#endif + return NULL; + } + } + if (out == NULL && ((out = calloc((length/2),sizeof(char))) == NULL)) { +#ifdef DEBUGBUILD + LOGERR("ERROR: Failed to allocate %d bytes\n", (length/2)); +#endif + return NULL; + } + for (i=0;i<length;i+=2) { + tmp[0] = hex[i]; + tmp[1] = hex[i+1]; + out[i/2] = (char) strtol(tmp, NULL, 16); + } + return out; +} + +/* Use a large buffer and complex method, as with a simple + * way there regularly were corrupt results with gcc -O2. */ +inline char *convert_from_binary(char *bin, size_t l, char *out) { + char tmp[24]; + size_t i,pos; + if (bin == NULL || l == 0) return NULL; + if (out == NULL && (out = calloc(((l*2)+1),sizeof(char))) == NULL) { +#ifdef DEBUGBUILD + LOGERR("ERROR: Failed to allocate %lu bytes\n", ((l*2)+1)); +#endif + return NULL; + } + for (i=0; i<l; i++) { + /* Keep in mind this format is not only two characters. */ + sprintf(tmp, "%02X", (unsigned char)bin[i]); + pos=strlen(tmp); + out[i*2] = tmp[pos-2]; + out[(i*2)+1] = tmp[pos-1]; + } + return out; +} + +#endif + diff --git a/include/output.h b/include/output.h new file mode 100644 index 0000000..efc7487 --- /dev/null +++ b/include/output.h @@ -0,0 +1,20 @@ +/* + * vim:ts=4:sw=4:expandtab + */ +#ifndef OUTPUT_H +#define OUTPUT_H + +#include <stdio.h> + +#ifndef LOGERR +#define LOGERR(...) {fprintf(stderr, "[%s:%d] %s: ", __FILE__, __LINE__, __func__); fprintf(stderr, __VA_ARGS__);} +#endif + +#ifdef DEBUGBUILD +#define DBGTRC(...) LOGERR(__VA_ARGS__) +#else +#define DBGTRC(...) +#endif + +#endif + diff --git a/src/split_for_sort.c b/src/split_for_sort.c new file mode 100644 index 0000000..8f5d8b8 --- /dev/null +++ b/src/split_for_sort.c @@ -0,0 +1,367 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2025 Thorsten Töpper + * + * It takes quite some time for sort if it's about text files with multiple + * gigabytes content. Split the content into buckets and afterwards sort those. + * Once done the content of those sorted temporary files can be merged via cat. + * + * vim:ts=4:sw=4:expandtab + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <errno.h> +#include <sys/stat.h> + +#include "output.h" + +/* Also used for paths => at least PATH_MAX */ +#define BUFFERSIZE 4096 + +/* The arguments will be + * PREFIX - file prefix... + * SPLIT_LENGTH - the number of characters which will be relevant for file selection + * FILES to split + */ +struct list_node { + char *name; + struct list_node *next; + FILE *fd; +}; + +struct list_head { + struct list_node *first; + size_t splitlength; + size_t length; +}; + +int push_into_list_unique(struct list_head *list, char *name); +int split_into_files(struct list_head *list, char *inputfile, char *prefix); +size_t list_check_length(struct list_head *list); +struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname); +struct list_node *get_node(struct list_head *list, char *name); +void destroy_list(struct list_head *list); +void fflush_list(struct list_head *list); +void print_list(struct list_head *list, FILE *out); + +void destroy_list(struct list_head *list) { + struct list_node *n = NULL, *b = NULL; + if (list == NULL) + return; + n = list->first; + while (n != NULL) { + b = n->next; + if (n->name != NULL) free(n->name); + if (n->fd != NULL) fclose(n->fd); + free(n); + n = b; + } + free(list); +} + +void fflush_list(struct list_head *list) { + struct list_node *n = NULL; + if (list == NULL) return; + n = list->first; + while (n != NULL) { + if (n->fd != NULL) { + fflush(n->fd); + } + n = n->next; + } +} + +void print_list(struct list_head *list, FILE *out) { + struct list_node *n = NULL; + if (list == NULL) + return; + if (out == NULL) out = stderr; + fprintf(out, "length: %lu\n",list_check_length(list)); + n = list->first; + while (n != NULL) { + fprintf(out, "node '%s' %s\n", n->name, + ((n->fd != NULL) ? "open file descriptor" : "")); + n = n->next; + } +} + +inline size_t list_check_length(struct list_head *list) { + size_t l = 0; + struct list_node *ptr = NULL; + if (list == NULL) { + LOGERR("ERROR: No list given.\n"); + return 0; + } + ptr = list->first; + while (ptr != NULL) { + l++; + ptr = ptr->next; + } + list->length = l; + return l; +} + +inline int push_into_list_unique(struct list_head *list, char *name) { + size_t name_length = 0; /* required for __builtin___strncpy_chk */ + struct list_node *ptr = NULL, *tmp = NULL; + if (list == NULL || name == NULL || name[0] == '\0') { + LOGERR("ERROR: Invalid function arguments.\n"); + return -1; + } + ptr = list->first; + while (ptr != NULL) { + if (strcmp(ptr->name, name) == 0) { + return 0; + } + ptr = ptr->next; + } + + if ((tmp = calloc(1, sizeof(struct list_node))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for new node\n"); + return -2; + } + + name_length = strlen(name); + if ((tmp->name = calloc(name_length+2, sizeof(char))) == NULL) { + LOGERR("ERROR Failed to allocate %lu bytes for data in list\n", strlen(name)+2); + free(tmp); + return -3; + } + strncpy(tmp->name, name, name_length+1); + tmp->fd = NULL; + tmp->next = list->first; + list->first = tmp; + list->length++; + return 0; +} + +inline struct list_node *get_node(struct list_head *list, char *name) { + struct list_node *n = NULL; + if (list == NULL || name == NULL || name[0] == '\0') { + LOGERR("ERROR: Invalid arguments\n"); + return NULL; + } + + n = list->first; + while (n != NULL) { + if (strcmp(n->name, name) == 0) + return n; + n = n->next; + } + + return NULL; +} + +/* Open the given file and extend / create a list of output filenames, + * based on the read content. + * splitlength - the length of a line considered regarding the comparision. + */ +struct list_head *extend_list(struct list_head *list, size_t splitlength, char *fname) { + FILE *fd = NULL; + char *line = NULL, *previous = NULL; + size_t i = 0, len = 0; + + if (fname == NULL || fname[0] == '\0' || splitlength == 0) { + LOGERR("ERROR: filename or splitlength invalid\n"); + return NULL; + } + + if ((fd=fopen(fname, "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s\n", fname, strerror(errno)); + return NULL; + } + + if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for read buffer\n"); + fclose(fd); + return NULL; + } + + if ((previous = calloc(BUFFERSIZE, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for read buffer\n"); + free(line); + fclose(fd); + return NULL; + } + + if ((list == NULL) && \ + ((list = calloc(1, sizeof(struct list_head))) == NULL)) { + LOGERR("ERROR: Failed to create new list"); + free(line); + fclose(fd); + return NULL; + } + + list->splitlength = splitlength; + + while (fgets(line, BUFFERSIZE, fd) != NULL) { + len = strlen(line); + len = (splitlength < len) ? splitlength : len; + /* most simple way to stick with FS compatible characters */ + for (i=0; i<len; i++) { + if (! isalnum(line[i]) ) { + line[i] = '_'; + } + } + line[len] = '\0'; + if (strncmp(line, previous, splitlength) != 0) { + if (push_into_list_unique(list, line) != 0) { + LOGERR("WARNING: Failed to insert '%s' into list.\n", line); + continue; + } + strncpy(previous, line, splitlength); + } + } + + fclose(fd); + free(line); + free(previous); + + return list; +} + +/* Parse the inputfile and split it's content into the given list of targets + * with the prefix for the name of the target. + */ +int split_into_files(struct list_head *list, char *inputfile, char *prefix) { + FILE *fdin = NULL; + char *line = NULL, *filename = NULL, *line_head = NULL, *prev_head = NULL; + size_t i = 0, len=0, sl = 0; + struct list_node *node = NULL; + + + if (inputfile == NULL || strlen(inputfile) == 0) { + LOGERR("ERROR: no valid filename for input.\n"); + return -1; + } + if (list == NULL) { + LOGERR("ERROR: No list regarding output files given\n"); + return -1; + } + + if ((line = calloc(BUFFERSIZE, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for input buffer.\n"); + return -2; + } + if ((line_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for input buffer.\n"); + free(line); + return -2; + } + if ((prev_head = calloc(BUFFERSIZE, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for input buffer.\n"); + free(line); + free(line_head); + return -2; + } + if ((filename = calloc(BUFFERSIZE, sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for input buffer.\n"); + free(line); + free(line_head); + free(prev_head); + return -2; + } + + if ((fdin = fopen(inputfile, "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s' to read: %s\n", + inputfile, strerror(errno)); + free(filename); + free(line); + free(line_head); + free(prev_head); + return -3; + } + + /* Optical attraction: work action happens below */ + sl = list->splitlength; + while (fgets(line, BUFFERSIZE, fdin) != NULL) { + strncpy(line_head, line, sl+1); + len = strnlen(line_head, sl); + len = (len<sl) ? len : sl; + for (i=0; i<len; i++) { + if (! isalnum(line_head[i]) ) { + line_head[i] = '_'; + } + } + line_head[len] = '\0'; + + /* Gain performance if some files are partly sorted */ + if (strcmp(line_head, prev_head) != 0) { + node = get_node(list, line_head); + if (node == NULL) { + LOGERR("WARNING: no node found for %s - skip line\n", line_head); + continue; + } + + /* keep the file descriptors open across the input files to be split */ + if (node->fd == NULL) { + snprintf(filename, BUFFERSIZE, "%s%s", ((prefix==NULL)?"":prefix), node->name); + if ((node->fd = fopen(filename, "w")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); + free(line); + free(line_head); + free(filename); + fclose(fdin); + return -4; + } + } + strncpy(prev_head, line_head, sl); + } + + if (fputs(line, node->fd) == EOF) { + LOGERR("ERROR: Failed to write into output file for '%s': %s\n", + line_head, strerror(errno)); + free(line); + free(line_head); + free(filename); + fclose(fdin); + return -5; + } + } + + fflush_list(list); + fclose(fdin); + free(line); + free(line_head); + free(filename); + return 0; +} + +int main(int argc, char **argv) { + int i = 3; + size_t splitlength = 0; + struct list_head *list = NULL; + + if (argc < 4) { + fprintf(stderr, "Usage: %s prefix length files...\n\n", argv[0]); + fprintf(stderr, "\tprefix - used with the output filenames\n" + "\tlength - the number of characters relevant for comparing\n"); + return EXIT_FAILURE; + } + + splitlength = strtoull(argv[2], NULL, 10); + if (splitlength == 0) { + LOGERR("ERROR: Failed to read valid length from argument '%s' base 10 number >=1 expected\n", argv[2]); + return EXIT_FAILURE; + } + + for (i=3; i<argc; i++) { +// fprintf(stderr, "=> Scan '%s' for bucket filenames\n", argv[i]); + list = extend_list(list, splitlength, argv[i]); + } + + for (i=3; i<argc; i++) { +// fprintf(stderr,"=> Split '%s' into bucket files\n", argv[i]); + if (split_into_files(list, argv[i], argv[1]) < 0) { + destroy_list(list); + return EXIT_FAILURE; + } + } + + destroy_list(list); + + return EXIT_SUCCESS; +} + |
