diff options
| author | Thorsten Töpper <atsutane@freethoughts.de> | 2025-08-10 18:16:07 +0200 |
|---|---|---|
| committer | Thorsten Töpper <atsutane@freethoughts.de> | 2025-08-10 18:16:07 +0200 |
| commit | 9e2f3d59cf249403859916df9756c179753ea7e0 (patch) | |
| tree | 6aaacfd22fc681fb7d95826ef65726c392cfc7d8 /include | |
| parent | 5b743929d23ca0e8004fe2d6bc8ff5c04ed9dbb9 (diff) | |
| download | small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.gz small-utils-9e2f3d59cf249403859916df9756c179753ea7e0.tar.bz2 | |
split_for_sort: Split a given file into buckets
The target bucket is decided based on the first X characters of a line.
The bucket name gets a prefix defined as argument and can be sorted
faster on weak hardware. Note: This is just a split alternative.
Real world usage in a shell script with a file in which the first 10
characters are equal in each line, so the following 2 bytes are
evaluated for splitting:
split_for_sort TMPSFS 12 raw_data.txt
for f in TMPSFS* ; do
sort -o "${f}_sorted" -u "${f}"
done
\# Rely on the argument resolution to go with lexical order
cat TMPSFS*_sorted > sorted_data.txt
rm TMPSFS*
Diffstat (limited to 'include')
| -rw-r--r-- | include/hex_conversion.h | 96 | ||||
| -rw-r--r-- | include/output.h | 20 |
2 files changed, 116 insertions, 0 deletions
/* ---- new file: include/hex_conversion.h ---- */
/*
 * vim:ts=4:sw=4:expandtab
 */
#ifndef HEX_CONVERSION_H
#define HEX_CONVERSION_H

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#ifdef DEBUGBUILD
#include "output.h"
#endif

/*
 * Non-zero when c is one of 0-9, A-F or a-f.
 * The argument is evaluated several times, so it must not have side
 * effects; use ishex() when that matters.
 */
#define ishex_macro(c) ((((c) >= '0') && ((c) <= '9')) || \
                        (((c) >= 'A') && ((c) <= 'F')) || \
                        (((c) >= 'a') && ((c) <= 'f')))

/*
 * The helpers are defined in this header, hence internal linkage: with
 * plain "inline" plus a non-inline prototype every including translation
 * unit would emit its own external definition and linking two of them
 * fails with duplicate symbols.
 */
static inline int convert_line(char *s);
static inline int ishex(char c);
static inline char *convert_to_binary(char *hex, char *out);
static inline char *convert_from_binary(char *bin, size_t l, char *out);

/*
 * Upper-case the string s in place and cut off its line ending.
 *
 * The string is terminated at the first '\r' found within its last two
 * characters or at a '\n' that is its last character; characters behind
 * that position are not converted.
 *
 * Returns 0 on success and -1 if s is NULL.
 */
static inline int convert_line(char *s) {
    size_t i, l;

    if (s == NULL) {
        return -1;
    }
    l = strlen(s);
    for (i = 0; i < l; i++) {
        /* <ctype.h> functions are only defined for unsigned char values. */
        s[i] = (char) toupper((unsigned char) s[i]);
        if ((s[i] == '\r' && (l - i < 3)) || (s[i] == '\n' && i == l - 1)) {
            s[i] = '\0';
            break;
        }
    }
    return 0;
}

/*
 * Returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0.
 */
static inline int ishex(char c) {
    return ishex_macro(c) ? 1 : 0;
}

/*
 * Convert the hexadecimal string hex into raw bytes.
 *
 * hex: NUL terminated string with an even, non-zero number of hex digits.
 * out: buffer receiving strlen(hex)/2 bytes. If it is NULL a buffer of
 *      exactly that size is allocated with calloc() and the caller has to
 *      free() it.
 *
 * The result is binary data and NOT NUL terminated.
 *
 * Returns the output buffer, or NULL if hex is NULL, empty, of odd length,
 * contains a non-hex character, or the allocation failed.
 */
static inline char *convert_to_binary(char *hex, char *out) {
    char tmp[3] = {0, 0, 0};
    size_t length, i;

    if (hex == NULL) {
        return NULL;
    }
    length = strlen(hex);
    if ((length == 0) || (length % 2 == 1)) {
        return NULL;
    }
    /* Validate everything first so nothing is allocated for bad input. */
    for (i = 0; i < length; i++) {
        if (!ishex_macro(hex[i])) {
#ifdef DEBUGBUILD
            LOGERR("Incompatible string '%s'\n", hex);
#endif
            return NULL;
        }
    }
    if (out == NULL && ((out = calloc(length / 2, sizeof(char))) == NULL)) {
#ifdef DEBUGBUILD
        LOGERR("ERROR: Failed to allocate %zu bytes\n", length / 2);
#endif
        return NULL;
    }
    for (i = 0; i < length; i += 2) {
        tmp[0] = hex[i];
        tmp[1] = hex[i + 1];
        /* Two validated hex digits always yield a value in 0..255. */
        out[i / 2] = (char) (unsigned char) strtol(tmp, NULL, 16);
    }
    return out;
}

/*
 * Convert l raw bytes from bin into upper-case hexadecimal digits.
 *
 * bin: input bytes.
 * l:   number of input bytes, must not be 0.
 * out: buffer receiving 2*l characters. If it is NULL a zeroed buffer of
 *      2*l+1 bytes is allocated with calloc(), so that result is NUL
 *      terminated, and the caller has to free() it. A buffer supplied by
 *      the caller is NOT terminated by this function.
 *
 * Each byte is handled as unsigned char and mapped through a digit table,
 * which always gives exactly two characters per byte.
 *
 * Returns the output buffer, or NULL if bin is NULL, l is 0, or the
 * allocation is impossible or failed.
 */
static inline char *convert_from_binary(char *bin, size_t l, char *out) {
    static const char digits[] = "0123456789ABCDEF";
    size_t i;

    if (bin == NULL || l == 0) {
        return NULL;
    }
    if (out == NULL) {
        /* (l*2)+1 must not wrap around before it reaches calloc(). */
        if (l > (SIZE_MAX - 1) / 2) {
            return NULL;
        }
        out = calloc((l * 2) + 1, sizeof(char));
        if (out == NULL) {
#ifdef DEBUGBUILD
            LOGERR("ERROR: Failed to allocate %zu bytes\n", (l * 2) + 1);
#endif
            return NULL;
        }
    }
    for (i = 0; i < l; i++) {
        unsigned char byte = (unsigned char) bin[i];

        out[i * 2] = digits[byte >> 4];
        out[(i * 2) + 1] = digits[byte & 0x0F];
    }
    return out;
}

#endif

/* ---- new file: include/output.h ---- */
/*
 * vim:ts=4:sw=4:expandtab
 */
#ifndef OUTPUT_H
#define OUTPUT_H

#include <stdio.h>

/*
 * Print a printf-style message to stderr, prefixed with file, line and
 * function of the call site. Wrapped in do/while(0) so that the macro
 * acts as a single statement, e.g. in an unbraced if/else.
 */
#ifndef LOGERR
#define LOGERR(...) do { \
        fprintf(stderr, "[%s:%d] %s: ", __FILE__, __LINE__, __func__); \
        fprintf(stderr, __VA_ARGS__); \
    } while (0)
#endif

/* Trace output: same as LOGERR in debug builds, expands to nothing otherwise. */
#ifdef DEBUGBUILD
#define DBGTRC(...) LOGERR(__VA_ARGS__)
#else
#define DBGTRC(...)
#endif

#endif
