diff options
Diffstat (limited to 'src/mem_internal_check.c')
| -rw-r--r-- | src/mem_internal_check.c | 631 |
1 files changed, 156 insertions, 475 deletions
diff --git a/src/mem_internal_check.c b/src/mem_internal_check.c index ac881b9..e3b60a5 100644 --- a/src/mem_internal_check.c +++ b/src/mem_internal_check.c @@ -21,6 +21,7 @@ #include "output.h" #include "hex_conversion.h" +#include "time_utils.h" #ifndef LINELENGTH @@ -36,442 +37,212 @@ struct data_field { unsigned char data[FULL_DATABLOCK_SIZE]; /* keep this as first element for bsearch */ +#ifdef RAW_DATA_IN_MEMORY + /* In order to work with streams, the filters and not the raw data is loaded, therefore + * the field is not required. Keeping it as brain trigger in case this changes in the + * future. */ bool active; +#endif }; -struct list_node { - struct list_node *next; - struct data_field *field; -}; - -struct list_head { - struct list_node *first_node; - struct list_node *last_node; - size_t size; -}; - -struct list_collection { - struct list_head *heads[256]; -}; - - struct data_array { - struct data_field **field; - size_t entry_points[256]; /* keep constant */ - size_t opt_entry_points[256]; /* increase if referred target is deactivated */ - size_t array_length; + struct data_field *fields; + size_t length; + size_t entry_points[256]; /* performance optimization reduces search area*/ }; -void apply_filter_to_array(struct data_array *array, unsigned char *data); +void apply_filter_to_array(struct data_field *array, unsigned char *data); int compare_data_fields(const void *a, const void *b); -struct data_array *convert_collection_to_array(struct list_collection *collection); -struct list_collection *create_collection(); -struct list_head *create_list_head(); -struct list_node *create_list_node(unsigned char *data); -void destroy_collection(struct list_collection *c); -void destroy_data_array(struct data_array *da); -void destroy_list(struct list_head *head); -void fprint_array(FILE *fd, struct data_array *da); -struct list_collection *import_file_into_collection(char *filename); -bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data); -bool run_filter_file_on_array(struct data_array *da, char *filename); +int compare_string_data_field(const void *a, const void *b); +void fprint_array(FILE *fd, struct data_array *array); +struct data_array *import_file_into_array(char *filename); +bool is_valid_raw_data(char *s, bool cut); +bool run_file_on_filter_array(struct data_array *da, char *filename, FILE *output); bool search_data_in_array(struct data_array *da, unsigned char *data); -bool verify_data_array(struct data_array *da); -bool verify_list_collection(struct list_collection *lc); - -#ifdef DEBUGBUILD -void print_collection_state(struct list_collection *coll); -#endif +void sort_array(struct data_array *da); /* === IMPLEMENTATION === */ -inline struct list_collection *create_collection() { - int i; - struct list_collection *c; - - if ((c=calloc(1, sizeof(struct list_collection))) == NULL) { - return NULL; - } - for (i=0; i<256; i++) { - c->heads[i] = create_list_head(); - if (c->heads[i] == NULL) { break; } - } - DBGTRC("DEBUG: %d lists initialized%s\n", i, - (i<256)?", aborting as not all 256 where" : ""); - return c; -} - - -inline struct list_head *create_list_head() { - struct list_head *head = calloc(1, sizeof(struct list_head)); - if (head == NULL) { - /* If the following masked fprintf calls fail there was a chance */ - LOGERR("ERROR: Failed to allocate memory for list_head: "); - fputs(strerror(errno), stderr); - fputc('\n', stderr); - return NULL; - } - head->first_node = NULL; - head->last_node = NULL; - head->size = 0; - return head; -} - -/* bsearch and similar */ +/* bsearch, qsort etc. */ int compare_data_fields(const void *a, const void *b) { return memcmp(((struct data_field*)a)->data, ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } -inline bool search_data_in_array(struct data_array *array, unsigned char *data) { - struct data_field *df, tmp_df; - memcpy(tmp_df.data, data, FULL_DATABLOCK_SIZE); - /* Currently not working */ - df = bsearch(&tmp_df, array->field, - array->array_length, sizeof(struct data_field), - compare_data_fields); - if (df != NULL) { - return true; - } - return false; -} - - -inline void destroy_collection(struct list_collection *c) { - int i; - if (c == NULL) return; - for (i=0; i<256; i++) { - destroy_list(c->heads[i]); - } - free(c); -} - - -inline void destroy_list(struct list_head *head) { - struct list_node *ptr=NULL; - if (head == NULL) return; - while (head->first_node != NULL) { - ptr = head->first_node; - head->first_node = head->first_node->next; - if (ptr->field != NULL) - free(ptr->field); - free(ptr); - } - free(head); - head = NULL; -} - -inline void destroy_data_array(struct data_array *da) { - size_t i=0; - if (da == NULL) return; - for (;i<da->array_length; i++) { - if (da->field[i] != NULL) { - free(da->field[i]); - } - } - free(da->field); - free(da); -} - -inline struct list_node *create_list_node(unsigned char *data) { - struct list_node *node; - if (data == NULL) return NULL; - if ((node=calloc(1,sizeof(struct list_node))) == NULL) { - LOGERR("ERROR: Failed to allocate a few bytes for new list_node\n"); - return NULL; - } - if ((node->field=calloc(1,sizeof(struct data_field))) == NULL) { - LOGERR("ERROR: Failed to allocate a few bytes for new data_field\n"); - free(node); - return NULL; - } - node->next = NULL; - node->field->active = true; - memcpy(node->field->data, data, FULL_DATABLOCK_SIZE); - return node; +int compare_string_data_field(const void *a, const void *b) { + return memcmp((unsigned char*)a, + ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } -/* If the input data's already sorted best performance, worst case (O^n)/256 if it's reverse sorted. */ -inline bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data) { - struct list_node *node_ptr = NULL, *tmp_node = NULL; - struct list_head *head_ptr = NULL; - int cmp = 0; -#ifdef DEBUGBUILD - static size_t fcall_count = 0; -#endif +inline bool search_data_in_array(struct data_array *array, unsigned char *data) { + struct data_field *df; + size_t entry = 0, rest_length = array->length; - if (collection == NULL || data == NULL) { - DBGTRC("ERROR: collection set%s / data set%s\n", - ((collection==NULL) ? " NULL" : ""), - ((data==NULL) ? " NULL" : "")); + if (array == NULL || data == NULL) { + LOGERR("ERROR: Missing argument%s%s\n", + (array==NULL) ? " array" : "", + (data ==NULL) ? " data" : ""); return false; } -#ifdef DEBUGBUILD - DBGTRC("DEBUG: fcall_count = %lu / data[0]=0x%02X / decimal: data[0]=%3u\n", - ++fcall_count, data[0], (unsigned int)data[0]); -#endif - - head_ptr = collection->heads[data[0]]; - if (head_ptr->first_node == NULL) { - DBGTRC("DEBUG: First node for list %d\n", data[0]); - if ((tmp_node = create_list_node(data)) == NULL) { - return false; - } - head_ptr->first_node = tmp_node; - head_ptr->last_node = tmp_node; - head_ptr->size = 1; - return true; + entry = array->entry_points[data[0]]; + if (data[0] == 255 || array->entry_points[data[0]+1] == 0) { + rest_length = array->length - entry; + } else { + rest_length = array->entry_points[data[0]+1] - entry; } - cmp = memcmp(data, head_ptr->last_node->field->data, FULL_DATABLOCK_SIZE); - if (cmp == 0) { - /* duplicate, skip */ - return true; - } + DBGTRC("DEBUG: data[0] = %d (dec) / entry = %lu / rest_length = %lu\n", + data[0], entry, rest_length); - /* Append to list */ - if (cmp > 0) { - if ((tmp_node = create_list_node(data)) == NULL) { - return false; - } - head_ptr->last_node->next = tmp_node; - head_ptr->last_node = tmp_node; - head_ptr->size++; - return true; - } + df = bsearch(data, &(array->fields[entry]), + rest_length, sizeof(struct data_field), + compare_string_data_field); - node_ptr = head_ptr->first_node; - cmp = memcmp(data, node_ptr->field->data, FULL_DATABLOCK_SIZE); - if (cmp == 0) { return true; } - /* First to enter? Prepend to list*/ - if (cmp < 0) { - if ((tmp_node = create_list_node(data)) == NULL) { - return false; - } - tmp_node->next = head_ptr->first_node; - head_ptr->first_node = tmp_node; - head_ptr->size++; + /* + df = bsearch(data, array->fields, + array->length, sizeof(struct data_field), + compare_string_data_field); + */ + if (df != NULL) { return true; } - - /* Sloooow part... */ - while (node_ptr->next != NULL) { - /* check next in list */ - cmp = memcmp(data, node_ptr->next->field->data, FULL_DATABLOCK_SIZE); - if (cmp == 0) { return true; } - /* insert by prepend */ - if (cmp > 0) { - if ((tmp_node = create_list_node(data)) == NULL) { - return false; - } - tmp_node->next = node_ptr->next; - node_ptr->next = tmp_node; - head_ptr->size++; - return true; - } - /* continue */ - node_ptr = node_ptr->next; - } - LOGERR("ERROR: Could not insert dataset into list, missed possible situation...\n"); return false; } -inline struct data_array *convert_collection_to_array(struct list_collection *collection) { - int i; - size_t s; - struct data_array *da = NULL; - struct list_node *ptr, *tmp; - - if (collection == NULL) { - LOGERR("ERROR: No collection given.\n"); - return NULL; - } - - s = 0; - for (i=0; i<256; i++) { -#ifdef DEBUGBUILD - if (collection->heads[i]->size > 0) { - DBGTRC("DEBUG: collections->heads[%d]->size = %lu\n", - i, collection->heads[i]->size); - } -#endif - s += collection->heads[i]->size; - DBGTRC("DEBUG: collection->heads[%d]->size = %lu\n", i, collection->heads[i]->size); - } - DBGTRC("DEBUG: Total size %lu\n", s); - - if (s == 0) { - LOGERR("ERROR: Empty collection."); - return NULL; - } +inline void sort_array(struct data_array *da) { + size_t i = 0, p = 0; + struct data_field *f; - if ((da = calloc(1, sizeof(struct data_array))) == NULL) { - LOGERR("ERROR: Failed to allocate %lu bytes...\n", sizeof(struct data_array)); - return NULL; - } + if (da == NULL) return; - da->array_length = s; - if ((da->field=calloc(da->array_length, sizeof(struct data_field*))) == NULL) { - free(da); - LOGERR("ERROR: Failed to allocate memory for metadata\n"); - return NULL; - } + qsort(da->fields, da->length, sizeof(struct data_field), compare_data_fields); - s = 0; - for (i=0; i<256; i++) { - DBGTRC("DEBUG: i=%d / s=%lu\n", i, s); - ptr = collection->heads[i]->first_node; - da->entry_points[i] = s; - da->opt_entry_points[i] = s; -#ifdef DEBUGBUILD - if (ptr != NULL) { - DBGTRC("DEBUG: list %d in collection, claims size %lu\n", i, - collection->heads[i]->size); - } -#endif - while (ptr != NULL) { - DBGTRC("DEBUG: Setting field %lu to %s\n", s, - (ptr->field == NULL) ? "NULL" : "valid value"); - da->field[s] = ptr->field; - s += 1; /* field[s++] failed... compiler optimization? */ - tmp = ptr->next; - ptr->next = NULL; - free(ptr); - ptr = tmp; + /* set entry points */ + f = da->fields; + for (i=1,p=0; i<da->length; i++, p++) { + if (f[p].data[0] != f[i].data[0]) { + da->entry_points[f[i].data[0]] = i; + if (f[i].data[0] == 255) { + break; + } } - collection->heads[i]->first_node = NULL; - collection->heads[i]->last_node = NULL; - collection->heads[i]->size = 0; } - - destroy_collection(collection); - - return da; } -/* This is acceptable if the largest can be hold in memory, but not the other way */ -inline void apply_filter_to_array(struct data_array *array, unsigned char *data) { - size_t pos=0, limit = 0; - int cmp; - +inline bool is_valid_raw_data(char *s, bool cut) { + size_t length = 0; + if (s == NULL) return false; - if (array==NULL || data==NULL) { - return; - } + length = strlen(s); + if (length < NON_CONVERTED_SIZE) + return false; - /* Manual search with entry_points */ - if (data[0] == 255) { - limit = array->array_length; + if (s[NON_CONVERTED_SIZE] == '\n') { + length = NON_CONVERTED_SIZE; } else { - limit = array->entry_points[data[0]+1]; - } - - pos=array->opt_entry_points[data[0]]; - DBGTRC("DEBUG: opt_entry_point=%-8lu / limit=%-8lu\n", pos, limit); - DBGTRC("DEBUG: data[0]=0x%02X / decimal: data[0]=%3u\n", - data[0], (unsigned int)data[0]); - /* TODO: Evaluate whether this loop should be handlued with openmp */ - for (; pos<limit; pos++) { - DBGTRC("DEBUG: pos=%lu\n", pos); - if (array->field[pos]->active) { - /* perf optimization, single byte faster than memcmp on 20 bytes */ - if (data[1] < array->field[pos]->data[1]) { - continue; - } - cmp = memcmp(data, array->field[pos]->data, FULL_DATABLOCK_SIZE); - DBGTRC("cmp: %d\n", cmp); - if (cmp == 0) { - array->field[pos]->active = false; - if (pos == array->opt_entry_points[data[0]]) { - array->opt_entry_points[data[0]]++; - } - return; - } -#if 1 - else if (data[1] > array->field[pos]->data[1]) { - return; - } -#else - /* Currently I don't trust this memcmp result 100%. testdata created in shell - * for i in $(seq 1 20000); do echo "foo_$i" | sha1sum | awk ' { print $1; } ' ; done - * picking a random subset from there to be used as filters lead to sometimes leaving - * the loop too early with this else... Comparing only the second byte may is also too - * vague with really large datasets */ - else if (cmp < 0) { - /* array content is sorted, so abort once the possible match has been passed */ - return; - } -#endif + while (isspace(s[length-1])) { + length--; } + if (length != NON_CONVERTED_SIZE) + return false; } + + if ( ! ishex_string(s,NON_CONVERTED_SIZE)) + return false; + + if (cut) + s[length] = '\0'; + + return true; } -struct list_collection *import_file_into_collection(char *filename) { +struct data_array *import_file_into_array(char *filename) { FILE *fd = NULL; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; - size_t line_nr = 0; - struct list_collection *collection = NULL; + size_t line_nr = 0, fpos = 0; + struct data_array *array = NULL; if (filename == NULL || filename[0] == '\0') { LOGERR("ERROR: no filename given.\n"); return NULL; } - if ((collection = create_collection()) == NULL) { - LOGERR("ERROR: Failed to allocate a few bytes for collection.\n"); + if ((fd = fopen(filename, "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + filename, strerror(errno), errno); return NULL; } + /* Count the number of valid lines */ + while (fgets(line, LINELENGTH, fd) != NULL) { + if (is_valid_raw_data(line, false)) { + line_nr++; + } + } + fclose(fd); + + /* only complete close and open worked reliably, let's just hope nobody modified the file + * TODO: compare stat() mtime? */ if ((fd = fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); - destroy_collection(collection); return NULL; } - while (fgets(line, LINELENGTH, fd) != NULL) { - line_nr++; - DBGTRC("DEBUG: line %lu = %s", line_nr, line); - if (convert_line(line)<0) { - LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); - destroy_collection(collection); - fclose(fd); - return NULL; + if ((array=calloc(1,sizeof(struct data_array))) == NULL) { + fclose(fd); + LOGERR("ERROR: Failed to allocate <32 bytes...\n"); + return NULL; + } + array->length = line_nr; + + if ((array->fields=calloc(array->length,sizeof(struct data_field))) == NULL) { + fclose(fd); + LOGERR("ERROR: Failed to allocate %lu bytes...\n", (array->length * sizeof(struct data_field))); + return NULL; + } + DBGTRC("DEBUG: array->fields[] size is %lu bytesi\n", (array->length * sizeof(struct data_field))); + + + while ((fgets(line, LINELENGTH, fd) != NULL) && + fpos < array->length) { + if ( ! is_valid_raw_data(line, true)) { + continue; } if (convert_to_binary(line, data) == NULL) { - destroy_collection(collection); + LOGERR("ERROR: Failed to convert line %lu into binary data. '%s'\n", line_nr, line); + free(array->fields); + free(array); fclose(fd); - LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); return NULL; } - if (!insert_into_sorted_collection(collection, data)) { - destroy_collection(collection); - fclose(fd); - LOGERR("ERROR: Failed to insert line %lu in binary form into collection.\n", line_nr); - return NULL; - } + memcpy(array->fields[fpos].data, data, FULL_DATABLOCK_SIZE); +#ifdef RAW_DATA_IN_MEMORY + array->fields[fpos].active = true; +#endif + fpos++; } - DBGTRC("DEBUG: lines read %lu\n", line_nr); + DBGTRC("DEBUG: valid lines read %lu\n", line_nr); fclose(fd); - return collection; + return array; } -bool run_filter_file_on_array(struct data_array *da, char *filename) { - FILE *fd = NULL; +bool run_file_on_filter_array(struct data_array *da, char *filename, FILE *output) { + FILE *fd = NULL, *fdout = output; size_t line_nr = 0; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; @@ -480,6 +251,10 @@ bool run_filter_file_on_array(struct data_array *da, char *filename) { LOGERR("ERROR: compromised arguments at call.\n"); return false; } + + if (output == NULL) + fdout = stdout; + if ((fd=fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); @@ -490,15 +265,19 @@ bool run_filter_file_on_array(struct data_array *da, char *filename) { while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; DBGTRC("READ LINE %lu: %s", line_nr, line); - if (convert_line(line)<0) { - LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); + if ( ! is_valid_raw_data(line, true) ) { continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); continue; } - apply_filter_to_array(da, (unsigned char*)data); + + if ( ! search_data_in_array(da, data) ) { + convert_line(line); /* for us only a toupper() loop */ + fputs(line, fdout); + fputc('\n', fdout); + } } fclose(fd); @@ -509,117 +288,29 @@ bool run_filter_file_on_array(struct data_array *da, char *filename) { void fprint_array(FILE *fd, struct data_array *da) { FILE *fdout = stdout; - char *plaintext = NULL; + char plaintext[NON_CONVERTED_SIZE+1]; size_t pos; if (da == NULL) { return; } if (fd != NULL) { fdout = fd; } - DBGTRC("DEBUG: da->array_length = %lu\n", da->array_length); - for (pos=0; pos < da->array_length; pos++) { - DBGTRC("DEBUG: pos %lu --- %s\n", pos, (da->field[pos] == NULL)? "NULL" : "element exists"); - if (da->field[pos]->active) { - plaintext = convert_from_binary(da->field[pos]->data, FULL_DATABLOCK_SIZE, plaintext); - fputs(plaintext, fdout); - fputc('\n', fdout); - } - } - if (plaintext != NULL) { free(plaintext); }; -} - -/* To be used on a new array */ -bool verify_data_array(struct data_array *da) { - bool res = true; - size_t p = 0; - int repeated_error = 0; - - if (da == NULL) { - LOGERR("No array given...\n"); - return false; - } - - /* Check entry points */ - for (p=0; p<256; p++) { - if (da->entry_points[p] != da->opt_entry_points[p]) { - LOGERR("MISMATCH entry_points[%lu] <-> opt_entry_points[%lu]\n", p, p); - res = false; - } - } - - for (p=0; p<da->array_length; p++) { - if (da->field[p] == NULL) { - LOGERR("ERROR: field %lu of %lu is NULL.\n", p, da->array_length); - res = false; - repeated_error++; - if (repeated_error > 5) - return res; - } - } - - return res; -} - - -bool verify_list_collection(struct list_collection *lc) { - bool res = true; - int i; - size_t count; - struct list_node *ptr; - - for (i=0; i<256; i++) { - count = 0; - ptr = lc->heads[i]->first_node; - - while (ptr != NULL) { - count++; - ptr = ptr->next; - } - - LOGERR("DEBUG: list %3d/0x%02X: size %lu, counted %lu => does%s match.\n", - i, i, lc->heads[i]->size, count, - (lc->heads[i]->size != count)?" NOT":""); - if (lc->heads[i]->size != count) { - res = false; - } + plaintext[NON_CONVERTED_SIZE] = '\0'; + for (pos=0; pos < da->length; pos++) { + convert_from_binary(da->fields[pos].data, FULL_DATABLOCK_SIZE, plaintext); + fputs(plaintext, fdout); + fputc('\n', fdout); } - - return res; } -#ifdef DEBUGBUILD -void print_collection_state(struct list_collection *coll) { - int i = 0; - struct list_head *h = NULL; - struct list_node *ptr = NULL; - - if (coll == NULL) { - LOGERR("ERROR: No collection given\n"); - return; - } - - for (i=0; i<256; i++) { - h = coll->heads[i]; - ptr = h->first_node; - if (h->size == 0 || ptr == NULL) { - if (!(h->size == 0 && ptr == NULL)) { - DBGTRC("DEBUG: ERROR item %3d not matching size %lu, first_node %p\n", i, h->size, h->first_node); - } - continue; - } - DBGTRC("DEBUG: List %3d, size %lu, first_node %p\n", i, h->size, h->first_node); - } - -} -#endif - int main(int argc, char **argv) { FILE *output = NULL; - int i = 0; - size_t s = 0; - struct list_collection *collection; + int i = 3; + size_t s = 0; struct data_array *array; + struct timespec t1, t2, tdiff; if (argc < 3) { - fprintf(stderr, "Usage: %s output raw_data filter_files...\n", argv[0]); + fprintf(stderr, "Usage: %s output filter_file data_file...\n\n", argv[0]); + fprintf(stderr, "Loads filters into memory, does NOT remove duplicates\n"); return EXIT_FAILURE; } @@ -635,40 +326,30 @@ int main(int argc, char **argv) { } LOGERR("IMPORT FILE %s\n", argv[2]); - collection = import_file_into_collection(argv[2]); - if (collection == NULL) { - return EXIT_FAILURE; - } -/* TODO: Hide this behind some option - if ( ! verify_list_collection(collection) ) { - return EXIT_FAILURE; - } -*/ - array = convert_collection_to_array(collection); + array = import_file_into_array(argv[2]); if (array == NULL) { return EXIT_FAILURE; } - collection = NULL; -/* TODO: Hide this behind an option - if ( ! verify_data_array(array) ) { - return EXIT_FAILURE; - } -*/ + LOGERR("run qsort on in-memory data\n"); + TU_MEASURE_TIME( CLOCK_PROCESS_CPUTIME_ID, &t1, &t2, + sort_array(array); + ); + difftime_timespec(t1, t2, &tdiff); + LOGERR("Sorted in %lu s and %lu ns process CPU time\n", tdiff.tv_sec, tdiff.tv_nsec ); for (i=3; i<argc; i++) { - LOGERR("APPLY FILTER FILE %s\n", argv[i]); - if (!run_filter_file_on_array(array, argv[i])) { + LOGERR("APPLY FILTER ON FILE %s\n", argv[i]); + if (!run_file_on_filter_array(array, argv[i], output)) { LOGERR("ERROR: Aborting.\n"); return EXIT_FAILURE; } } - DBGTRC("DEBUG: Filter applied, final result:\n"); - fprint_array(output, array); fflush(output); fclose(output); - destroy_data_array(array); + free(array->fields); + free(array); return EXIT_SUCCESS; } |
