From 2283d88d98c9aa9edf5df926c39c0b7040308d41 Mon Sep 17 00:00:00 2001
From: Jeffrey Picard
Date: Sun, 15 Feb 2015 21:09:43 +0000
Subject: [PATCH] Add join utility.

---
 Makefile |    1 +
 join.c   |  510 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 511 insertions(+), 0 deletions(-)
 create mode 100644 join.c

diff --git a/Makefile b/Makefile
index ba5d96f..708b607 100644
--- a/Makefile
+++ b/Makefile
@@ -98,6 +98,7 @@ BIN =\
 	grep\
 	head\
 	hostname\
+	join\
 	kill\
 	link\
 	ln\
diff --git a/join.c b/join.c
new file mode 100644
index 0000000..992b6fc
--- /dev/null
+++ b/join.c
@@ -0,0 +1,510 @@
+/* See LICENSE file for copyright and license details. */
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "util.h"
+
+static int tflag = 0;
+static int aflag = 0;
+static int eflag = 0;
+/*static int oflag = 0;*/
+static int oneflag = 0;
+static int twoflag = 0;
+
+static char *estr = "";
+static char *empty_str = "";
+static int join_index1 = 1;
+static int join_index2 = 1;
+static int delim = ' ';
+static int (*is_delim)(int);
+static int afile;
+static int print_unjoinable1 = 0;
+static int print_unjoinable2 = 0;
+
+/*
+ * A run of consecutive input lines sharing one join field, kept as a
+ * chain of reusable line_t nodes.
+ */
+struct line_list_t {
+	int cur_lines;        /* nodes, counted from lines, in the current run */
+	int capacity;         /* line_t nodes created for this list so far */
+	struct line_t *lines; /* head of the chain */
+	struct line_t *next;  /* line read past the run; 0 once getline fails */
+};
+
+/* One input line, split in place into fields. */
+struct line_t {
+	char *fields;         /* getline buffer; delimiters become '\0' */
+	char *join_field;     /* start of the join field, else empty_str */
+	int num_fields;
+	size_t len;           /* line length, trailing newline removed */
+	size_t buf_size;      /* buffer size as maintained by getline */
+	int f_idx;            /* cursor for get_next_field_line() */
+	int iter_capacity;    /* slots allocated in iter_fields */
+	char **iter_fields;   /* iter_fields[i] is the start of field i */
+	struct line_t *link;  /* next node in the chain */
+};
+
+static void
+usage(void)
+{
+	eprintf("usage: %s file1 file2\n", argv0);
+}
+
+/* Delimiter predicate used with -t: matches only the given character. */
+static int
+delim_func(int c)
+{
+	return c == delim;
+}
+
+/*
+ * Return the next field of line, skipping the join field, and advance
+ * line->f_idx; return 0 when no fields remain. Callers reset f_idx to
+ * 0 before iterating.
+ */
+static inline char *
+get_next_field_line(struct line_t *line, int join_index)
+{
+	if (line->f_idx == join_index - 1)
+		line->f_idx++;
+	if (line->f_idx > line->num_fields - 1)
+		return 0;
+	return line->iter_fields[line->f_idx++];
+}
+
+/*
+ * Release every line owned by list. get_line_list() detaches list->next
+ * from the chain, so it is freed separately unless it has already
+ * become the head.
+ */
+static void
+free_line_list(struct line_list_t *list)
+{
+	struct line_t *tofree = 0;
+	struct line_t *line = list->lines;
+
+	if (list->next && list->next != list->lines) {
+		free(list->next->fields);
+		free(list->next->iter_fields);
+		free(list->next);
+	}
+
+	while (line) {
+		tofree = line;
+		line = line->link;
+		free(tofree->fields);
+		free(tofree->iter_fields);
+		free(tofree);
+	}
+}
+
+/* Double the number of slots in line->iter_fields. */
+static void
+resize_iter(struct line_t *line)
+{
+	size_t new_size;
+
+	line->iter_capacity = line->iter_capacity * 2;
+	new_size = line->iter_capacity * sizeof(*line->iter_fields);
+	line->iter_fields = realloc(line->iter_fields, new_size);
+	if (!line->iter_fields)
+		enprintf(2, "realloc:");
+}
+
+/* Allocate a zeroed line with room for 10 field pointers. */
+static struct line_t *
+create_line(void)
+{
+	struct line_t *new = calloc(1, sizeof(*new));
+
+	if (!new)
+		enprintf(2, "calloc:");
+	new->iter_capacity = 10;
+	new->iter_fields = calloc(new->iter_capacity, sizeof(*new->iter_fields));
+	if (!new->iter_fields)
+		enprintf(2, "calloc:");
+	return new;
+}
+
+/*
+ * Split line->fields in place: each delimiter is overwritten with '\0'
+ * and the start of every field is recorded in line->iter_fields. Sets
+ * line->num_fields and points line->join_field at field number ji
+ * (1-based), or at empty_str when that field is missing or empty.
+ * Without -t, leading blanks are skipped and a run of blanks counts as
+ * one delimiter. Bytes are handed to is_delim() as unsigned char
+ * values, which is what the <ctype.h> predicates require.
+ */
+static void
+process_fields(struct line_t *line, int ji)
+{
+	int num_fields = 1;
+	int i = 0;
+	char *fields = line->fields;
+	int len = line->len;
+
+	line->join_field = 0;
+
+	if (len == 0) {
+		line->join_field = empty_str;
+		line->num_fields = 0;
+		return;
+	}
+
+	if (!tflag) {
+		while (i < len && is_delim((unsigned char)*fields)) {
+			fields++;
+			i++;
+		}
+	}
+
+	line->iter_fields[0] = i < len ? fields : 0;
+
+	while (i < len) {
+		if (is_delim((unsigned char)*fields)) {
+			num_fields++;
+			i++;
+			*fields++ = '\0';
+			/* in the default case (whitespace) multiple delimiters
+			 * count as one */
+			if (!tflag) {
+				while (i < len && is_delim((unsigned char)*fields)) {
+					*fields++ = '\0';
+					i++;
+				}
+			}
+			if (num_fields >= line->iter_capacity)
+				resize_iter(line);
+			line->iter_fields[num_fields-1] = fields;
+		} else {
+			if (!line->join_field && num_fields == ji)
+				line->join_field = fields;
+			fields++;
+			i++;
+		}
+	}
+
+	if (!line->join_field)
+		line->join_field = empty_str;
+	line->num_fields = num_fields;
+}
+
+/*
+ * Read the next line of fp into line, strip a trailing newline and
+ * split it into fields with join field number ji. Returns the line
+ * length, or -1 when getline() fails (end of input or error).
+ */
+static int
+get_line(FILE *fp, struct line_t *line, int ji)
+{
+	ssize_t n;
+
+	/* test the signed result before storing it in the size_t len */
+	n = getline(&line->fields, &line->buf_size, fp);
+	if (n < 0)
+		return -1;
+	line->len = n;
+
+	/* We may not have an eol char at the end of a file */
+	if (line->len > 0 && line->fields[line->len-1] == '\n') {
+		line->len--;
+		line->fields[line->len] = '\0';
+	}
+	process_fields(line, ji);
+
+	return line->len;
+}
+
+/*
+ * Starting from the line already held in list->lines, read following
+ * lines of fp for as long as their join field equals that of the
+ * previous line, chaining them and counting them in list->cur_lines.
+ * The first line with a different join field is detached from the
+ * chain, stored in list->next and linked in front of list->lines so
+ * that the caller can make it the new head. Returns the length of
+ * that line, or -1 with list->next set to 0 when getline() fails.
+ */
+static int
+get_line_list(FILE *fp, struct line_list_t *list, int ji)
+{
+	int ret = 0;
+	int i = 0;
+	struct line_t *next = 0;
+	struct line_t *cur = 0;
+
+	if (0 == list->capacity) {
+		next = create_line();
+		list->lines = next;
+		list->capacity++;
+	}
+
+	list->cur_lines = 1;
+	cur = list->lines;
+	for (i = 1; ; i++) {
+		/* only create a node when the chain has none left to reuse */
+		if (i >= list->capacity) {
+			next = create_line();
+			cur->link = next;
+			list->capacity++;
+		} else {
+			next = cur->link;
+		}
+
+		ret = get_line(fp, next, ji);
+		if (-1 == ret)
+			break;
+
+		/* we assume that if this function was called, a match on the
+		 * join_field was found, so cur->join_field is not null, but
+		 * next->join_field could be */
+		if (next->join_field &&
+		    strcmp(cur->join_field, next->join_field) == 0) {
+			cur = next;
+			list->cur_lines++;
+		} else {
+			cur->link = next->link;
+			list->next = next;
+			next->link = list->lines;
+			break;
+		}
+	}
+
+	if (-1 == ret) {
+		list->next = 0;
+		return -1;
+	}
+	return ret;
+}
+
+/*
+ * Print a line that found no partner: its join field first, then its
+ * remaining fields, separated by delim.
+ */
+static void
+print_unjoinable(struct line_t *line, int join_index)
+{
+	char *field;
+
+	fputs(line->join_field, stdout);
+
+	line->f_idx = 0;
+	while ((field = get_next_field_line(line, join_index))) {
+		putchar(delim);
+		fputs(field, stdout);
+	}
+	putchar('\n');
+}
+
+/*
+ * Print one joined output line: the join field, the other fields of
+ * line1, then the other fields of line2, separated by delim. With -e,
+ * empty fields are replaced by estr.
+ */
+static void
+print_joined_lines(struct line_t *line1, struct line_t *line2)
+{
+	char *field;
+
+	if (eflag && strlen(line1->join_field) == 0)
+		fputs(estr, stdout);
+	else
+		fputs(line1->join_field, stdout);
+
+	line1->f_idx = 0;
+	while ((field = get_next_field_line(line1, join_index1))) {
+		if (eflag && strlen(field) == 0)
+			field = estr;
+		putchar(delim);
+		fputs(field, stdout);
+	}
+
+	line2->f_idx = 0;
+	while ((field = get_next_field_line(line2, join_index2))) {
+		if (eflag && strlen(field) == 0)
+			field = estr;
+		putchar(delim);
+		fputs(field, stdout);
+	}
+	putchar('\n');
+}
+
+/*
+ * Order two lines by join field. A line with fewer fields than its
+ * join index sorts before a line that has the field; two such lines
+ * compare equal.
+ */
+static int
+join_field_cmp(struct line_t *line1, struct line_t *line2)
+{
+	if (line1->num_fields < join_index1)
+		return line2->num_fields < join_index2 ? 0 : -1;
+	if (line2->num_fields < join_index2)
+		return 1;
+	return strcmp(line1->join_field, line2->join_field);
+}
+
+/*
+ * Merge-join fp1 and fp2: the input whose current join field compares
+ * lower is advanced (printing the skipped line when -a asked for it)
+ * until the fields match, then every pairing of the two equal runs is
+ * printed. This relies on both inputs being ordered on their join
+ * fields. Always returns 0.
+ */
+static int
+join(FILE *fp1, FILE *fp2)
+{
+	/* int, like the return type of get_line(), so that the checks
+	 * against -1 need no unsigned conversion */
+	int len1 = 0, len2 = 0;
+	int i, j;
+	int cmp;
+	struct line_t *walk1, *walk2;
+	struct line_t *line1, *line2;
+	struct line_list_t line_list1, line_list2;
+
+	memset(&line_list1, 0, sizeof(line_list1));
+	memset(&line_list2, 0, sizeof(line_list2));
+
+	line1 = create_line();
+	line2 = create_line();
+
+	line_list1.lines = line1;
+	line_list1.capacity = line_list1.cur_lines = 1;
+	line_list2.lines = line2;
+	line_list2.capacity = line_list2.cur_lines = 1;
+
+	len1 = get_line(fp1, line1, join_index1);
+	len2 = get_line(fp2, line2, join_index2);
+
+	for (;;) {
+		if (-1 == len1 || -1 == len2)
+			break;
+
+		cmp = join_field_cmp(line1, line2);
+		if (cmp > 0) {
+			if (print_unjoinable2)
+				print_unjoinable(line2, join_index2);
+			len2 = get_line(fp2, line2, join_index2);
+		} else if (cmp < 0) {
+			if (print_unjoinable1)
+				print_unjoinable(line1, join_index1);
+			len1 = get_line(fp1, line1, join_index1);
+		} else {
+			/* gather both runs of equal join fields, then print
+			 * every pairing */
+			len1 = get_line_list(fp1, &line_list1, join_index1);
+			len2 = get_line_list(fp2, &line_list2, join_index2);
+
+			walk1 = line_list1.lines;
+			for (i = 0; i < line_list1.cur_lines; i++, walk1 = walk1->link) {
+				walk2 = line_list2.lines;
+				for (j = 0; j < line_list2.cur_lines; j++, walk2 = walk2->link) {
+					print_joined_lines(walk1, walk2);
+				}
+			}
+
+			line1 = line_list1.next;
+			line2 = line_list2.next;
+			if (!line_list1.next || !line_list2.next)
+				break;
+			line_list1.lines = line_list1.next;
+			line_list2.lines = line_list2.next;
+		}
+	}
+
+	/* Once we've finished with one file, check if the other still has
+	 * lines and we want to print unjoinable lines */
+	if ((-1 == len1 || !line_list1.next) &&
+	    -1 != len2 &&
+	    print_unjoinable2) {
+		do print_unjoinable(line2, join_index2);
+		while (get_line(fp2, line2, join_index2) != -1);
+	}
+
+	if ((-1 == len2 || !line_list2.next) &&
+	    -1 != len1 &&
+	    print_unjoinable1) {
+		do print_unjoinable(line1, join_index1);
+		while (get_line(fp1, line1, join_index1) != -1);
+	}
+
+	free_line_list(&line_list1);
+	free_line_list(&line_list2);
+
+	return 0;
+}
+
+int
+main(int argc, char *argv[])
+{
+	FILE *fp1;
+	FILE *fp2;
+	int ret;
+
+	ARGBEGIN {
+	case 't':
+		tflag = 1;
+		/* keep the unsigned char value so that it equals the bytes
+		 * process_fields() passes to is_delim() */
+		delim = (unsigned char)EARGF(usage())[0];
+		break;
+	case '1':
+		oneflag = 1;
+		join_index1 = strtoul(EARGF(usage()), 0, 10);
+		break;
+	case '2':
+		twoflag = 1;
+		join_index2 = strtoul(EARGF(usage()), 0, 10);
+		break;
+	case 'e':
+		eflag = 1;
+		estr = EARGF(usage());
+		break;
+	case 'a':
+		aflag = 1;
+		afile = strtoul(EARGF(usage()), 0, 10);
+		/* record each occurrence so that -a 1 -a 2 selects both
+		 * files */
+		if (1 == afile)
+			print_unjoinable1 = 1;
+		else if (2 == afile)
+			print_unjoinable2 = 1;
+		else
+			eprintf("join: file number '%d' invalid\n", afile);
+		break;
+	case 'o':
+		eprintf("Not implemented\n");
+		/* NOTE(review): reaches usage() below if eprintf returns */
+	default:
+		usage();
+	} ARGEND;
+
+	if (argc != 2)
+		usage();
+
+	/* field numbers are 1-based */
+	if (join_index1 < 1 || join_index2 < 1)
+		usage();
+
+	if (!(fp1 = fopen(argv[0], "r"))) {
+		weprintf("fopen %s:", argv[0]);
+		return 1;
+	}
+	if (!(fp2 = fopen(argv[1], "r"))) {
+		weprintf("fopen %s:", argv[1]);
+		fclose(fp1);
+		return 1;
+	}
+
+	is_delim = tflag ? delim_func : isblank;
+
+	ret = join(fp1, fp2);
+
+	fclose(fp1);
+	fclose(fp2);
+
+	return ret;
+}
-- 
1.7.1