From 3d2fd7c1366d8167041fe89a57a9bacaa1ae7f22 Mon Sep 17 00:00:00 2001
From: Charlie Gordon <github@chqrlie.org>
Date: Mon, 31 Mar 2025 17:39:27 +0200
Subject: [PATCH] c2cat: improve output consistency

* output tokens with the original spelling
* use Style enum to support custom colors
* show syntax errors
* add `--color` and `--nocolor` to force/disable color output
---
 ast_utils/color-custom.c2 | 131 +++++++++++
 parser/c2_tokenizer.c2    |   5 +-
 recipe.txt                |   1 +
 tools/c2cat.c2            | 461 ++++++++++++++++++++------------------
 4 files changed, 374 insertions(+), 224 deletions(-)
 create mode 100644 ast_utils/color-custom.c2

diff --git a/ast_utils/color-custom.c2 b/ast_utils/color-custom.c2
new file mode 100644
index 00000000..ed298c90
--- /dev/null
+++ b/ast_utils/color-custom.c2
@@ -0,0 +1,131 @@
+/* Copyright 2025 Charlie Gordon
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+module color;
+
+import ctype local;
+import stdio local;
+import stdlib local;
+import string local;
+
+const char*[] standardColors = {    // flat list of pairs: even index = name matched by convertColor, odd index = color constant returned for it
+    "black",        Black,
+    "red",          Red,
+    "green",        Green,
+    "yellow",       Yellow,
+    "blue",         Blue,
+    "magenta",      Magenta,
+    "cyan",         Cyan,
+    "grey",         Grey,
+    "darkgrey",     Darkgrey,
+    "bred",         Bred,
+    "bgreen",       Bgreen,
+    "byellow",      Byellow,
+    "bblue",        Bblue,
+    "bmagenta",     Bmagenta,
+    "bcyan",        Bcyan,
+    "white",        White,
+    "normal",       Normal,
+}
+
+// Parse one `style=value` (or `style:value`) entry at *pp: buf1 gets the style
+// name lowercased, buf2 the value lowercased without '-' and '_'; both are
+// truncated to fit. On success *pp moves past the entry and one ',' or ';'.
+// Returns false, leaving *pp unchanged, at end of input or without '=' / ':'.
+fn bool getStyleDef(char* buf1, u32 size1, char* buf2, u32 size2, const char** pp) {
+    const char *p = *pp;
+    // ctype functions need an unsigned char value: the u8 casts keep bytes
+    // >= 0x80 (negative where char is signed) from being undefined behavior
+    while (isspace((u8)(*p))) p++;
+    if (!*p) return false;
+    u32 i = 0;
+    while (isalpha((u8)(*p)) || *p == '.' || *p == '_') {
+        char c = (char)tolower((u8)(*p++));
+        if (i + 1 < size1) buf1[i++] = c;
+    }
+    buf1[i] = '\0';
+    if (*p != '=' && *p != ':') return false;
+    p++;
+    i = 0;
+    while (*p && *p != ' ' && *p != ',' && *p != ';') {
+        char c = (char)tolower((u8)(*p++));
+        if (i + 1 < size2 && c != '-' && c != '_') buf2[i++] = c;
+    }
+    buf2[i] = '\0';
+    if (*p == ',' || *p == ';') p++;
+    *pp = p;
+    return true;
+}
+
+// Tell whether user color `p` spells the table entry `name`; a 'b' followed
+// by "right" in `p` counts as the single letter 'b' ("brightred" == "bred").
+fn bool matchColorName(const char *p, const char *name) {
+    for (; *p; name++) {
+        char ch = *p++;
+        if (ch == 'b' && strncmp(p, "right", 5) == 0) p += 5;
+        if (ch != *name) return false;
+    }
+    return !*name;
+}
+
+// Translate a normalized color value to a color string: "" stays empty, a
+// standard name or "default" gives a shared string, "pN" (N in 0..255) and
+// "#rrggbb" give a strdup'ed escape sequence; anything else yields `def`.
+fn const char* convertColor(const char *val, const char *def) {
+    if (*val == '\0') return "";
+    for (u32 i = 0; i < elemsof(standardColors); i += 2) {
+        if (matchColorName(val, standardColors[i])) return standardColors[i + 1];
+    }
+    if (!strcasecmp(val, "default")) return def;
+    char[32] buf;
+    i32 pal;
+    u32 r, g, b;    // %x stores through unsigned int pointers
+    if (sscanf(val, "%*1[pP]%d", &pal) == 1) {
+        // 256-color palette: reject indexes that are not valid SGR arguments
+        if (pal < 0 || pal > 255) return def;
+        snprintf(buf, elemsof(buf), "\033[38;5;%dm", pal);
+    } else if (sscanf(val, "#%2x%2x%2x", &r, &g, &b) == 3) {
+        snprintf(buf, elemsof(buf), "\033[38;2;%u;%u;%um", r, g, b);
+    } else {
+        // TODO: complain about unknown color
+        return def;
+    }
+    return strdup(buf);
+}
+
+// Free `p` unless it is nil, empty, or one of the standardColors entries.
+public fn void freeConfigColor(const char* p) {
+    if (!p || !*p) return;
+    for (u32 k = 0; k < elemsof(standardColors); k++) {
+        if (p == standardColors[k]) return;
+    }
+    free((void*)p);
+}
+
+// Return the color configured for style `cat` in the C2_COLORS environment
+// variable, or `def` if it is unset or has no entry for `cat` (the first
+// matching entry wins). A value of exactly "none" gives "" for every style.
+public fn const char* getConfigColor(const char* cat, const char* def) {
+    const char *env = getenv("C2_COLORS");
+    if (!env) return def;
+    if (strcmp(env, "none") == 0) return "";
+    char[16] key;
+    char[16] value;
+    const char *cursor = env;
+    while (getStyleDef(key, elemsof(key), value, elemsof(value), &cursor)) {
+        if (strcmp(key, cat) == 0) return convertColor(value, def);
+    }
+    return def;
+}
diff --git a/parser/c2_tokenizer.c2 b/parser/c2_tokenizer.c2
index 7c0657a5..d2857eec 100644
--- a/parser/c2_tokenizer.c2
+++ b/parser/c2_tokenizer.c2
@@ -694,8 +694,7 @@ fn void Tokenizer.num_error(Tokenizer* t, Token* result, const char* p, const ch
     vsnprintf(t.error_msg, sizeof(t.error_msg), format, args);
     va_end(args);
 
-    // XXX: error position should be passed separately from token start
-    result.loc = t.loc_start + (SrcLoc)(p - t.input_start);
+    SrcLoc err_loc = t.loc_start + (SrcLoc)(p - t.input_start);
     // read the rest of the pp-number token
     for (;;) {
         if ((*p == 'e' || *p == 'E' || *p == 'p' || *p == 'P') && (p[1] == '+' || p[1] == '-')) {
@@ -712,7 +711,7 @@ fn void Tokenizer.num_error(Tokenizer* t, Token* result, const char* p, const ch
     }
     t.cur = p;
     result.len = (u16)((p - t.input_start) - (result.loc - t.loc_start));
-    if (t.on_warning) t.on_warning(t.fn_arg, result.loc);
+    if (t.on_warning) t.on_warning(t.fn_arg, err_loc);
 }
 
 fn void Tokenizer.lex_identifier(Tokenizer* t, Token* result) {
diff --git a/recipe.txt b/recipe.txt
index e28ea1eb..c4393d0c 100644
--- a/recipe.txt
+++ b/recipe.txt
@@ -378,6 +378,7 @@ executable c2cat
 	$backend c
 
     ast_utils/color.c2
+    ast_utils/color-custom.c2
     ast_utils/constants.c2
     ast_utils/number_radix.c2
     ast_utils/src_loc.c2
diff --git a/tools/c2cat.c2 b/tools/c2cat.c2
index 7c3aacba..a1229c00 100644
--- a/tools/c2cat.c2
+++ b/tools/c2cat.c2
@@ -19,43 +19,55 @@ import c2_tokenizer;
 import color;
 import file_utils;
 import keywords;
+import src_loc local;
 import string_buffer;
 import string_list;
 import string_pool;
 import number_radix;
 import token local;
 
-import ctype;
 import stdio local;
 import stdlib local;
 import string local;
 
-const char* col_keyword = color.Byellow;
-const char* col_type = color.Green;
-const char* col_feature = color.Blue;
-const char* col_attr = color.Blue;
-const char* col_identifier = "";
-const char* col_integer = color.Magenta;
-const char* col_float = color.Magenta;
-const char* col_charconst = color.Magenta;
-const char* col_string = color.Magenta;
-const char* col_comment = color.Bcyan;
-const char* col_invalid = color.Bred;
-const char* col_error = color.Bred;
-const char* col_normal = color.Normal;
+type Style enum u8 {    // highlighting category of a token; indexes default_colors, style_names and style_color
+    Normal,             // also used to reset colors; print_token's `if (s)` skips it (assumes Normal == 0)
+    Identifier,
+    Integer,
+    Float,
+    Charconst,
+    String,
+    Operator,
+    Type,               // builtin type keywords and qualifier keywords (see init_colors)
+    Keyword,
+    Function,           // identifier directly followed by '(' (chosen in print_token)
+    Attr,               // known attribute name while ctx.in_attributes is set (chosen in print_token)
+    Feature,
+    Invalid,
+    Comment,
+    Warning,
+    Error,
+}
 
 fn void usage(const char* me) {
-    printf("Usage: %s file.c2 ...\n", me);
+    printf("Usage: %s [options] file.c2 ...\n"
+           "    --color      force colorized output\n"
+           "    --nocolor    disable colorized output\n"
+           , me);  // NOTE: main also calls this for -?, -h and --help, so help exits with status 1 too
     exit(1);
 }
 
 type C2cat struct {
     string_pool.Pool* pool;
-    string_buffer.Buf* out;
     c2_tokenizer.Tokenizer* tokenizer;
     const char* input;
     u32 offset;
-    u32 in_attributes; // 0 no, 1 seen @, 2 (, ) -> 0
+    u32 length;                 // size of input in bytes (file.data_size() in c2cat)
+    string_buffer.Buf* out;     // output for the whole file, written to stdout at the end of c2cat
+    u8 in_attributes; // 0 no, 1 seen @, 2 (, ) -> 0
+    bool has_error;             // set by on_tokenizer_error, cleared by the lex loop in c2cat
+    Style* token_style;         // style for each token Kind (filled by init_colors)
+    const char** style_color;   // color string for each Style (filled by init_colors)
 }
 
 const char*[] attr_names = {
@@ -80,27 +92,107 @@ const char*[] attr_names = {
     "auto_func",
 }
 
+const char*[] default_colors = {    // built-in color per Style, passed to color.getConfigColor as the fallback
+    [Style.Normal] = color.Normal,
+    [Style.Identifier] = "",
+    [Style.Integer] = color.Magenta,
+    [Style.Float] = color.Magenta,
+    [Style.Charconst] = color.Magenta,
+    [Style.String] = color.Magenta,
+    [Style.Operator] = "",
+    [Style.Type] = color.Green,
+    [Style.Keyword] = color.Byellow,
+    [Style.Function] = color.White,
+    [Style.Attr] = color.Blue,
+    [Style.Feature] = color.Blue,
+    [Style.Invalid] = color.Bred,
+    [Style.Comment] = color.Bcyan,
+    [Style.Warning] = color.Bred,
+    [Style.Error] = color.Bred,
+}
+
+const char*[] style_names = {    // per-Style key that init_colors passes to color.getConfigColor (looked up in C2_COLORS)
+    [Style.Normal] = "normal",
+    [Style.Identifier] = "identifier",
+    [Style.Integer] = "integer",
+    [Style.Float] = "float",
+    [Style.Charconst] = "charconst",
+    [Style.String] = "string",
+    [Style.Operator] = "operator",
+    [Style.Type] = "type",
+    [Style.Keyword] = "keyword",
+    [Style.Function] = "function",
+    [Style.Attr] = "attr",
+    [Style.Feature] = "feature",
+    [Style.Invalid] = "invalid",
+    [Style.Comment] = "comment",
+    [Style.Warning] = "warning",
+    [Style.Error] = "error",
+}
 
-fn void init_colors() {
-    if (!color.useColor()) {
-        col_keyword = "";
-        col_type = "";
-        col_feature = "";
-        col_attr = "";
-        col_identifier = "";
-        col_integer = "";
-        col_float = "";
-        col_charconst = "";
-        col_string = "";
-        col_comment = "";
-        col_invalid = "";
-        col_error = "";
-        col_normal = "";
+fn bool init_colors(Style* token_style, const char** style_color) {    // fills the Kind -> Style and Style -> color tables; returns color.useColor()
+    for (Kind k = Kind.min; k <= Kind.max; k++) {    // one table entry per token kind
+        Style style = Normal;
+        switch (k) {
+        case None:
+            style = Normal;
+            break;
+        case Identifier:
+            style = Identifier;
+            break;
+        case IntegerLiteral:
+            style = Integer;
+            break;
+        case FloatLiteral:
+            style = Float;
+            break;
+        case CharLiteral:
+            style = Charconst;
+            break;
+        case StringLiteral:
+            style = String;
+            break;
+        case LParen ... GreaterGreaterEqual:
+            style = Operator;
+            break;
+        case KW_bool ... KW_void:
+            style = Type;
+            break;
+        case KW_as ... KW_while:
+            if (k.isQualifier()) style = Type;    // qualifier keywords share the Type style
+            else style = Keyword;
+            break;
+        case Feat_if ... Feat_warning:
+            style = Feature;
+            break;
+        case Invalid:
+            style = Invalid;
+            break;
+        case LineComment:
+        case BlockComment:
+            style = Comment;
+            break;
+        case Eof:
+            style = Normal;
+            break;
+        case Warning:
+            style = Warning;
+            break;
+        case Error:
+            style = Error;
+            break;
+        }
+        token_style[k] = style;
     }
+    for (Style s = Style.min; s <= Style.max; s++) {    // resolve each style: configured color if any, else the built-in default
+        style_color[s] = color.getConfigColor(style_names[s], default_colors[s]);
+    }
+    return color.useColor();
 }
 
-fn bool is_attribute(const char* str) {
-    for (u32 i=0; i<elemsof(attr_names); i++) {
+fn bool C2cat.is_attribute(C2cat* ctx, u32 name_idx) {
+    const char* str = ctx.pool.idx2str(name_idx);
+    for (u32 i = 0; i < elemsof(attr_names); i++) {
         if (strcmp(str, attr_names[i]) == 0) return true;
     }
     return false;
@@ -122,169 +214,55 @@ fn void C2cat.update_state(C2cat* ctx, const Token* tok) {
 
 fn void C2cat.print_token(C2cat* ctx, const Token* tok) {
     string_buffer.Buf* out = ctx.out;
+    u32 pos = tok.loc - ctx.tokenizer.loc_start;    // token start as a byte offset into ctx.input
+    u32 tok_len = tok.len;                          // token length in bytes
 
-    if (ctx.offset != 0) {
+    if (pos < ctx.offset) {
+        // token starts before end of previous token, this is an error
+        // TODO: output an error message to stderr?
+        out.add1('\n');
+        out.color(ctx.style_color[Style.Error]);
+        out.print("error: offset=%d pos=%d", ctx.offset, pos);
+        out.color(ctx.style_color[Style.Normal]);
+        out.add1('\n');
+        ctx.offset = pos;    // resync on the reported start; the token text is still printed below
+    }
+    if (pos > ctx.offset) {
         // copy stuff from file to out (from end of last token to start of current)
-        if (tok.done) return;
-        if (ctx.offset <= tok.loc) {
-            u32 len = tok.loc - ctx.offset;
-            if (len) out.add2(ctx.input + ctx.offset, len);
-        } else {
-            out.add1('\n');
-            out.color(col_error);
-            out.print("error: offset=%d tok.loc=%d", ctx.offset, tok.loc);
-            out.color(col_normal);
-            out.add1('\n');
-        }
+        // TODO: check for whitespace only
+        out.add2(ctx.input + ctx.offset, pos - ctx.offset);
+        ctx.offset = pos;
     }
 
-    if (tok.kind >= Kind.LParen && tok.kind < Kind.KW_bool) {
-        const char* str = tok.kind.str();
-        out.add(str);
-        ctx.offset = tok.loc + (u32)strlen(str);
-        return;
-    }
-    if (tok.kind.isBuiltinType()) {
-        const char* str = tok.kind.str();
-        out.color(col_type);
-        out.add(str);
-        out.color(col_normal);
-        ctx.offset = tok.loc + (u32)strlen(str);
-        return;
-    }
-    if (tok.kind.isQualifier()) {
-        const char* str = tok.kind.str();
-        out.color(col_type);
-        out.add(str);
-        out.color(col_normal);
-        ctx.offset = tok.loc + (u32)strlen(str);
-        return;
-    }
-    if (tok.kind.isKeyword()) {
-        const char* str = tok.kind.str();
-        out.color(col_keyword);
-        out.add(str);
-        out.color(col_normal);
-        ctx.offset = tok.loc + (u32)strlen(str);
-        return;
+    Style s = ctx.token_style[tok.kind];    // base style from the per-Kind table
+    if (tok.kind == Kind.Identifier) {
+        if (ctx.in_attributes && ctx.is_attribute(tok.name_idx)) {
+            s = Style.Attr;
+        } else
+        if (ctx.input[ctx.offset + tok_len] == '(') {    // identifier directly followed by '(' (no whitespace in between)
+            s = Style.Function;
+        }
     }
-    if (tok.kind >= Kind.Feat_if && tok.kind <= Kind.Feat_endif) {
-        const char* str = tok.kind.str();
-        out.color(col_feature);
-        out.add(str);
-        out.color(col_normal);
-        ctx.offset = tok.loc + (u32)strlen(str);
-        return;
+    if (s) {    // skipped for a zero style value (presumably Style.Normal)
+        out.color(ctx.style_color[s]);
     }
-    switch (tok.kind) {
-    case Identifier:
-        const char* str = ctx.pool.idx2str(tok.name_idx);
+    out.add2(ctx.input + ctx.offset, tok_len);    // emit the token bytes exactly as they appear in the input
 
-        if (ctx.in_attributes && is_attribute(str)) {
-            out.color(col_attr);
-            out.add(str);
-            out.color(col_normal);
-        } else {
-            out.color(col_identifier);
-            out.add(str);
-            out.color(col_normal);
-        }
-        ctx.offset = tok.loc + (u32)strlen(str);
-        return;
-    case IntegerLiteral:
-        out.color(col_integer);
-        char[64] tmp;
-        i32 len;
-        switch (tok.getRadix()) {
-        case Hex:
-            len = sprintf(tmp, "0x%x", tok.int_value);
-            break;
-        default:
-            len = sprintf(tmp, "%d", tok.int_value);
-            break;
-        }
-        out.add(tmp);
-        ctx.offset = tok.loc + len;
-        break;
-    case FloatLiteral:
-        out.color(col_float);
-        char[64] tmp;
-        i32 len;
-        switch (tok.getRadix()) {
-        case Hex:
-            len = sprintf(tmp, "%a", tok.float_value);
-            break;
-        default:
-            len = sprintf(tmp, "%#.16g", tok.float_value);
-            break;
-        }
-        out.add(tmp);
-        ctx.offset = tok.loc + len;
-        break;
-    case CharLiteral:
-        out.color(col_charconst);
-        char[64] tmp;
-        i32 len = 0;
-        switch (tok.getRadix()) {
-        case Hex:
-            len = sprintf(tmp, "'\\x%02x'", tok.char_value);
-            break;
-        case Octal:
-            len = sprintf(tmp, "'\\%o'", tok.char_value);
-            break;
-        default:
-            if (ctype.isprint(tok.char_value)) {
-                len = sprintf(tmp, "'%c'", tok.char_value);
-            } else {
-                tmp[0] = 0;
-                // TODO print nicely (eg \n etc)
-            }
-            break;
-        }
-        out.add(tmp);
-        ctx.offset = tok.loc + len;
-        break;
-    case StringLiteral:
-        out.color(col_string);
-        out.add1('"');
-        u32 len = out.encodeBytes(ctx.pool.idx2str(tok.text_idx), tok.text_len, '"');
-        out.add1('"');
-        ctx.offset = tok.loc + len + 2;
-        break;
-    case LineComment:
-        out.color(col_comment);
-        const char* str = ctx.pool.idx2str(tok.text_idx);
-        out.print("//%s", str);
-        ctx.offset = tok.loc + (u32)strlen(str) + 2;
-        break;
-    case BlockComment:
-        out.color(col_comment);
-        const char* str = ctx.pool.idx2str(tok.text_idx);
-        out.print("/*%s*/", str);
-        ctx.offset = tok.loc + (u32)strlen(str) + 4;
-        break;
-    case Invalid:
-        out.color(col_invalid);
-        out.print("%s", tok.invalid);
-        ctx.offset = tok.loc + (u32)strlen(tok.invalid);
-        break;
-    case Error:
-        out.add1('\n');
-        out.color(col_error);
-        out.print("error: %s", ctx.tokenizer.error_msg);
-        out.color(col_normal);
-        out.add1('\n');
-        break;
-    default:
-        out.color(col_error);
-        out.print("token %s\n", tok.kind.str());
-        ctx.offset = tok.loc + 1;
-        break;
+    if (s && *ctx.style_color[s]) {    // reset only when the style has a non-empty color string
+        out.color(ctx.style_color[Style.Normal]);
     }
-    out.color(col_normal);
+    ctx.offset += tok_len;    // ctx.offset now points just past this token in ctx.input
 }
 
-public fn i32 c2cat(const char* filename)
+fn void C2cat.on_tokenizer_error(void* arg, SrcLoc loc) {    // passed twice to tokenizer.init in c2cat(); loc is not used
+    C2cat* ctx = arg;           // arg is the &ctx given to tokenizer.init
+    ctx.has_error = true;       // the lex loop in c2cat() then sets the current token's kind to Error
+}
+
+fn i32 c2cat(const char* filename,
+             bool use_color,
+             Style* token_style,
+             const char** style_color)
 {
     file_utils.File file.init("", filename);
     if (!file.load()) {
@@ -292,51 +270,62 @@ public fn i32 c2cat(const char* filename)
         return -1;
     }
 
-    C2cat ctx = { }
-    ctx.pool = string_pool.create(16*1024, 1024);
-    ctx.out = string_buffer.create(16*1024, true, 2);
-    ctx.offset = 0;
-    ctx.input = file.data();
-    ctx.in_attributes = 0;
+    string_pool.Pool* pool = string_pool.create(16*1024, 1024);
+    string_buffer.Buf* buf = string_buffer.create(1024, false, 0);
+    const char* input = file.data();
     u32 file_size = file.data_size();
+    keywords.Info kwinfo.init(pool);
+    string_list.List features.init(pool);
+    string_buffer.Buf* out = string_buffer.create(16*1024, use_color, 2);
 
-    string_list.List features;
-    features.init(ctx.pool);
-    string_buffer.Buf* buf = string_buffer.create(1024, false, 0);
-    keywords.Info kwinfo;
-    kwinfo.init(ctx.pool);
-    c2_tokenizer.Tokenizer tokenizer;
-    tokenizer.init(ctx.pool, buf, ctx.input, 0, &kwinfo, &features, nil, nil, nil, true);
+    C2cat ctx = {
+        .pool = pool,
+        .input = input,
+        .offset = 0,
+        .length = file_size,
+        .out = out,
+        .in_attributes = 0,
+        .token_style = token_style,
+        .style_color = style_color,
+    }
+
+    c2_tokenizer.Tokenizer tokenizer.init(pool, buf, input, 1, &kwinfo, &features,
+                                          C2cat.on_tokenizer_error, C2cat.on_tokenizer_error, &ctx, true);
     ctx.tokenizer = &tokenizer;
 
-    Token tok;
-    tok.init();
+    Token tok.init();
 
     while (!tok.done) {
         tokenizer.lex(&tok);
-        //printf("%4d %s\n", tok.loc, tok.kind.str());
-
+        if (ctx.has_error) {
+            tok.kind = Error;
+            ctx.has_error = false;
+        }
         ctx.update_state(&tok);
-
         ctx.print_token(&tok);
     }
 
-    if (ctx.offset <= file_size) {
-        u32 len = file_size - ctx.offset;
-        if (len) ctx.out.add2(ctx.input + ctx.offset, len);
+    if (ctx.offset <= ctx.length) {
+        // TODO: EOF token should have ctx.offset == ctx.length
+        u32 len = ctx.length - ctx.offset;
+        if (len) {
+            out.color(style_color[Style.Error]);
+            out.add2(ctx.input + ctx.offset, len);
+            out.color(style_color[Style.Normal]);
+        }
     } else {
-        ctx.out.add1('\n');
-        ctx.out.color(col_error);
-        ctx.out.print("error: offset=%d file_size=%d", ctx.offset, file_size);
-        ctx.out.color(col_normal);
-        ctx.out.add1('\n');
+        out.add1('\n');
+        out.color(style_color[Style.Error]);
+        out.print("error: offset=%d file.size=%d", ctx.offset, ctx.length);
+        out.color(style_color[Style.Normal]);
+        out.add1('\n');
     }
-    fputs(ctx.out.data(), stdout);
+    fputs(out.data(), stdout);
     fflush(stdout);
 
-    ctx.pool.free();
-    ctx.out.free();
+    out.free();
     buf.free();
+    pool.free();
     file.close();
 
     return 0;
@@ -344,12 +333,42 @@ public fn i32 c2cat(const char* filename)
 
 public fn i32 main(i32 argc, const char** argv)
 {
-    if (argc == 1) usage(argv[0]);
-    init_colors();
+    Style[elemsof(Kind)] token_style;
+    const char*[elemsof(Style)] style_color;
+    // TODO: use custom colors (NOTE(review): init_colors() already applies C2_COLORS via color.getConfigColor - may be stale)
+    bool use_color = init_colors(token_style, style_color);
+    i32 filenum = 0;
+    i32 nfiles = 0;
+    for (i32 i = 1; i < argc; i++) {    // first pass: count file arguments (anything not starting with '-')
+        nfiles += (*argv[i] != '-');
+    }
     for (i32 i = 1; i < argc; i++) {
-        if (argc > 2)
-            printf("==> %s <==\n", argv[i]);
-        c2cat(argv[i]);
+        const char* arg = argv[i];
+        if (*arg == '-') {    // options take effect in argument order: they only apply to files listed after them
+            switch (arg) {
+            case "--color":
+                use_color = true;
+                break;
+            case "--nocolor":
+                use_color = false;
+                break;
+            case "-?":
+            case "-h":
+            case "--help":
+                usage(argv[0]);
+                break;
+            default:
+                fprintf(stderr, "c2cat: unknown option %s\n", arg);
+                exit(EXIT_FAILURE);
+            }
+        } else {
+            if (nfiles > 1) {    // file banner only when several files are listed
+                if (filenum++) printf("\n");
+                printf("==> %s <==\n", arg);
+            }
+            c2cat(arg, use_color, token_style, style_color);
+        }
     }
+    if (!nfiles) usage(argv[0]);    // no file argument at all: print usage and exit
     return 0;
 }