Add automatic code highlighting

author: flu0r1ne <flu0r1ne@flu0r1ne.net> 2023-05-06 05:42:39 -0500
committer: flu0r1ne <flu0r1ne@flu0r1ne.net> 2023-05-06 05:42:52 -0500
commit: 603ebf9a866314b3304f800d50c09a3cd55d8546 (patch)
tree: 8409896f90999eb556c0e2d46cb9c0f7e607d43a /src/gpt_chat_cli/chat_colorizer.py
parent: 537d08fd952a88a799eff4002d8e6f1d2c224258 (diff)
download: gpt-chat-cli-603ebf9a866314b3304f800d50c09a3cd55d8546.tar.xz
gpt-chat-cli-603ebf9a866314b3304f800d50c09a3cd55d8546.zip
1 files changed, 267 insertions, 0 deletions
diff --git a/src/gpt_chat_cli/chat_colorizer.py b/src/gpt_chat_cli/chat_colorizer.py
new file mode 100644
index 0000000..168a5b8
--- /dev/null
+++ b/src/gpt_chat_cli/chat_colorizer.py
@@ -0,0 +1,267 @@
+from pygments import highlight
+from pygments.lexer import Lexer
+from pygments.formatter import Formatter
+from pygments.lexers import (
+    get_lexer_by_name, get_all_lexers, guess_lexer,
+    find_lexer_class_by_name
+)
+from pygments.formatters import (
+    TerminalFormatter,
+    NullFormatter
+)
+
+from dataclasses import dataclass
+from typing import Optional
+from pygments.util import ClassNotFound
+
+from .streaming_lexer import (
+    SinglePassStreamingLexer,
+    Token,
+    TokenType,
+    TokenOrientation
+)
+
+from .color import (
+    get_color_codes,
+    ColorCode,
+)
+
+# Guessing a language takes time, so guess_lexer() only scores
+# the candidates listed here instead of every available lexer.
+# They likely cover upward of 90% of usage.
+#
+# Each entry is a lexer name that guess_lexer() resolves with
+# find_lexer_class_by_name(); candidates are scored in list order.
+GUESSABLE_LANGUAGES = [
+    'html',
+    'python',
+    'java',
+    'python2',
+    'c++',
+    'javascript',
+    'c#',
+    'sql',
+    'c',
+    'php',
+    'go',
+    'swift',
+    'kotlin',
+    'ruby',
+    'typescript',
+    'scala',
+    'r',
+    'rust',
+    'css',
+    'perl',
+    'make',
+    'text'
+]
+
+def guess_lexer( text : str, **options ):
+    '''
+    Guess the lexer in use from GUESSABLE_LANGUAGES.
+
+    Uses very primitive heuristics and is not very good.
+    '''
+
+    best_lexer = [0.0, None]
+
+    for lexer_name in GUESSABLE_LANGUAGES:
+
+        lexer = find_lexer_class_by_name(lexer_name)
+
+        rv = lexer.analyse_text(text)
+
+        if rv == 1.0:
+            return lexer(**options)
+        if rv > best_lexer[0]:
+            best_lexer[:] = (rv, lexer)
+
+    if not best_lexer[0] or best_lexer[1] is None:
+
+        raise ClassNotFound('no lexer matching the text found')
+
+    return best_lexer[1](**options)
+
+@dataclass
+class CodefenceContext:
+    language: str
+    lexer : Lexer
+    formatter : Formatter
+    buffer : str = ''
+    eof : bool = False
+
+    def may_guess_language( self : "CodefenceContext" ):
+
+        if self.eof:
+            return True
+
+        MIN_CHARACTERS = 150
+        MIN_LINES = 2
+
+        return (
+            len(self.buffer) > MIN_CHARACTERS and
+            self.buffer.count('\n') > MIN_LINES
+        )
+
+    def get_highlighted_lines(self : "CodefenceContext"):
+
+        if self.language is None:
+            if self.may_guess_language():
+
+                lexer = guess_lexer( self.buffer )
+
+                self.language = lexer.name
+                self.lexer = lexer
+
+            else:
+                return None
+
+
+        idx = self.buffer.rfind('\n')
+
+        if idx == -1:
+            return None
+        else:
+            lines = self.buffer[:idx+1]
+            self.buffer = self.buffer[idx+1:]
+
+            highlighted = highlight(
+                lines,
+                self.lexer,
+                self.formatter,
+            )
+
+            return highlighted
+
+class ChatColorizer( object ):
+
+    lexer : SinglePassStreamingLexer
+    formatter : Formatter
+    cf_ctx : Optional[CodefenceContext]
+    color_code : ColorCode
+    text_emitted: bool
+
+    def __init__( self : "ChatColorizer", no_color = False ):
+        self.lexer = SinglePassStreamingLexer()
+
+        self.cf_ctx = None
+
+        if no_color:
+            self.formatter = NullFormatter()
+        else:
+            self.formatter = TerminalFormatter()
+
+        self.color_code = get_color_codes( no_color=no_color )
+        self.text_emitted = False
+
+    def add_chunk( self : "ChatColorizer", chunk : str ):
+        self.lexer.add_chunk( chunk )
+
+    def print( self : "ChatColorizer" ):
+
+        for token in self.lexer.parse():
+
+            if token.type == TokenType.EOF:
+                break
+
+            if token.type == TokenType.CODE_FENCE:
+
+                if not self.text_emitted:
+                    print()
+                    self.text_emitted = True
+
+                if token.orientation == TokenOrientation.BEGIN:
+                    assert self.cf_ctx is None
+
+                    lang = token.content
+
+                    try:
+                        lexer = get_lexer_by_name(lang)
+                    except ClassNotFound:
+                        # try to guess it
+                        lang = None
+                        lexer = None
+
+                    self.cf_ctx = CodefenceContext(lang, lexer, self.formatter)
+
+                else:
+                    assert self.cf_ctx is not None
+
+                    self.cf_ctx.eof = True
+
+                    highlighted = self.cf_ctx.get_highlighted_lines()
+
+                    if highlighted:
+                        print( highlighted, end='', flush=True )
+
+                    self.cf_ctx = None
+
+                # Add extra \n to either side of a chunk
+                print(f'{self.color_code.WHITE}```{self.color_code.RESET}', flush=True)
+
+                continue
+
+            if self.cf_ctx:
+
+                self.cf_ctx.buffer += token.content
+                highlighted = self.cf_ctx.get_highlighted_lines()
+
+                if highlighted:
+                    print( highlighted, end='', flush=True )
+
+            else:
+
+                print( f'{self.color_code.WHITE}{token.content}{self.color_code.RESET}', end='', flush=True )
+                self.text_emitted = True
+
+
+    def finish( self : "ChatColorizer" ):
+        self.lexer.finish()
+
+
+# content = '''
+# Rust code:
+
+# ```rust
+# fn main() {
+#     let x = 5;
+#     let y = 10;
+#     let z = x + y;
+#     println!("The value of z is {}", z);
+# }
+# ```
+
+# Python code:
+
+# ```python
+# x = 5
+# y = 10
+# z = x + y
+# print("The value of z is", z)
+# ```
+
+# Unknown code:
+
+# ```
+# x = 5
+# y = 10
+# z = x + y
+# print("The value of z is", z)
+# ```
+
+
+# Testing
+
+# ```python
+# x = 5
+# y = 10
+# z = x + y
+# print("The value of z is", z)
+
+# '''
+
+# highlighter = ChatColorizer()
+
+# highlighter.add_chunk(content)
+# highlighter.finish()
+
+# highlighter.print()
author	flu0r1ne <flu0r1ne@flu0r1ne.net>	2023-05-06 05:42:39 -0500
committer	flu0r1ne <flu0r1ne@flu0r1ne.net>	2023-05-06 05:42:52 -0500
commit	603ebf9a866314b3304f800d50c09a3cd55d8546 (patch)
tree	8409896f90999eb556c0e2d46cb9c0f7e607d43a /src/gpt_chat_cli/chat_colorizer.py
parent	537d08fd952a88a799eff4002d8e6f1d2c224258 (diff)
download	gpt-chat-cli-603ebf9a866314b3304f800d50c09a3cd55d8546.tar.xz gpt-chat-cli-603ebf9a866314b3304f800d50c09a3cd55d8546.zip