aboutsummaryrefslogtreecommitdiff
path: root/src/gpt_chat_cli/chat_colorizer.py
diff options
context:
space:
mode:
author: flu0r1ne <flu0r1ne@flu0r1ne.net> 2023-05-06 05:42:39 -0500
committer: flu0r1ne <flu0r1ne@flu0r1ne.net> 2023-05-06 05:42:52 -0500
commit: 603ebf9a866314b3304f800d50c09a3cd55d8546 (patch)
tree: 8409896f90999eb556c0e2d46cb9c0f7e607d43a /src/gpt_chat_cli/chat_colorizer.py
parent: 537d08fd952a88a799eff4002d8e6f1d2c224258 (diff)
download: gpt-chat-cli-603ebf9a866314b3304f800d50c09a3cd55d8546.tar.xz
gpt-chat-cli-603ebf9a866314b3304f800d50c09a3cd55d8546.zip
Add automatic code highlighting
Diffstat (limited to 'src/gpt_chat_cli/chat_colorizer.py')
-rw-r--r-- src/gpt_chat_cli/chat_colorizer.py 267
1 file changed, 267 insertions, 0 deletions
diff --git a/src/gpt_chat_cli/chat_colorizer.py b/src/gpt_chat_cli/chat_colorizer.py
new file mode 100644
index 0000000..168a5b8
--- /dev/null
+++ b/src/gpt_chat_cli/chat_colorizer.py
@@ -0,0 +1,267 @@
+from pygments import highlight
+from pygments.lexer import Lexer
+from pygments.formatter import Formatter
+from pygments.lexers import (
+ get_lexer_by_name, get_all_lexers, guess_lexer,
+ find_lexer_class_by_name
+)
+from pygments.formatters import (
+ TerminalFormatter,
+ NullFormatter
+)
+
+from dataclasses import dataclass
+from typing import Optional
+from pygments.util import ClassNotFound
+
+from .streaming_lexer import (
+ SinglePassStreamingLexer,
+ Token,
+ TokenType,
+ TokenOrientation
+)
+
+from .color import (
+ get_color_codes,
+ ColorCode,
+)
+
# Language guessing is slow, so it is restricted to the short list of
# candidates below, on the assumption that they cover upward of 90% of
# usage. Order matters: when two candidates score equally, the one
# listed first is chosen.
GUESSABLE_LANGUAGES = [
    'html', 'python', 'java', 'python2',
    'c++', 'javascript', 'c#', 'sql',
    'c', 'php', 'go', 'swift',
    'kotlin', 'ruby', 'typescript', 'scala',
    'r', 'rust', 'css', 'perl',
    'make', 'text',
]
+
def guess_lexer( text : str, **options ):
    '''
    Pick a lexer for `text`, considering only GUESSABLE_LANGUAGES.

    Each candidate lexer class is asked to score the text through its
    `analyse_text` hook. A score of exactly 1.0 is accepted immediately;
    otherwise the highest strictly-positive score wins, with earlier
    entries of GUESSABLE_LANGUAGES winning ties. The heuristics are
    primitive, so the guess is frequently wrong.

    `options` are forwarded unchanged to the chosen lexer's constructor.

    Raises ClassNotFound when no candidate scores above zero.
    '''

    top_score = 0.0
    top_class = None

    for alias in GUESSABLE_LANGUAGES:

        candidate = find_lexer_class_by_name(alias)
        score = candidate.analyse_text(text)

        # A perfect score cannot be beaten; stop searching.
        if score == 1.0:
            return candidate(**options)

        if score > top_score:
            top_score, top_class = score, candidate

    if top_class is None or not top_score:
        raise ClassNotFound('no lexer matching the text found')

    return top_class(**options)
+
@dataclass
class CodefenceContext:
    '''
    State of one fenced code block while its content is being streamed.

    Incoming text is appended to `buffer`; `get_highlighted_lines`
    drains it one batch of complete lines at a time so that output can
    be shown before the whole block has arrived.
    '''

    # Language of the block, or None while it still has to be guessed.
    language: Optional[str]
    # Lexer used for highlighting, or None until the language is known.
    lexer : Optional["Lexer"]
    # Formatter that renders the highlighted tokens.
    formatter : "Formatter"
    # Text received so far that has not been highlighted yet.
    buffer : str = ''
    # Set once the closing fence (or end of input) has been reached.
    eof : bool = False

    def may_guess_language( self : "CodefenceContext" ) -> bool:
        '''
        Tell whether enough text is buffered to attempt a language guess.

        Always true at `eof`, since no further text will arrive. Before
        that, more than MIN_CHARACTERS characters and more than MIN_LINES
        newlines must be buffered.
        '''

        if self.eof:
            return True

        MIN_CHARACTERS = 150
        MIN_LINES = 2

        return (
            len(self.buffer) > MIN_CHARACTERS and
            self.buffer.count('\n') > MIN_LINES
        )

    def get_highlighted_lines(self : "CodefenceContext") -> Optional[str]:
        '''
        Highlight and remove the text that is ready to be displayed.

        Before `eof` only complete lines are consumed; a trailing partial
        line stays in `buffer` until more text arrives. At `eof` the
        whole remaining buffer is consumed, including a final line that
        lacks a newline, so no code is lost when the block closes.

        When the language is unknown it is guessed first, which sets
        `language` and `lexer` as a side effect.

        Returns the highlighted text, or None when nothing can be emitted
        yet (empty buffer, no complete line, or too little text to guess
        the language).
        '''

        if not self.buffer:
            return None

        if self.language is None:

            if not self.may_guess_language():
                return None

            # stripnl=False: the lexer is fed one batch of lines at a
            # time, and the default would strip blank lines found at the
            # start or end of each batch.
            lexer = guess_lexer( self.buffer, stripnl=False )

            self.language = lexer.name
            self.lexer = lexer

        if self.eof:
            # Nothing more will arrive: flush everything that is left.
            lines, self.buffer = self.buffer, ''
        else:
            idx = self.buffer.rfind('\n')

            if idx == -1:
                return None

            lines = self.buffer[:idx+1]
            self.buffer = self.buffer[idx+1:]

        return highlight(
            lines,
            self.lexer,
            self.formatter,
        )
+
class ChatColorizer( object ):
    '''
    Print streamed chat text, syntax-highlighting fenced code blocks.

    Typical use: call `add_chunk` as text arrives and `print` to emit
    whatever can be rendered so far; call `finish` once the stream is
    complete, followed by a last `print`.
    '''

    lexer : SinglePassStreamingLexer
    formatter : Formatter
    cf_ctx : Optional[CodefenceContext]
    color_code : ColorCode
    text_emitted: bool
    _finished : bool

    def __init__( self : "ChatColorizer", no_color = False ):
        '''
        Create a colorizer.

        With `no_color` set, code is passed through a NullFormatter and
        the color codes are requested with no_color=True.
        '''
        self.lexer = SinglePassStreamingLexer()

        # Context of the code fence currently open, if any.
        self.cf_ctx = None

        if no_color:
            self.formatter = NullFormatter()
        else:
            self.formatter = TerminalFormatter()

        self.color_code = get_color_codes( no_color=no_color )
        self.text_emitted = False
        self._finished = False

    def add_chunk( self : "ChatColorizer", chunk : str ):
        '''Hand another piece of the incoming text to the streaming lexer.'''
        self.lexer.add_chunk( chunk )

    def _flush_open_fence( self : "ChatColorizer" ):
        '''Emit whatever code is still buffered for the open fence and close it.'''

        if self.cf_ctx is None:
            return

        self.cf_ctx.eof = True

        highlighted = self.cf_ctx.get_highlighted_lines()

        if highlighted:
            print( highlighted, end='', flush=True )

        self.cf_ctx = None

    def print( self : "ChatColorizer" ):
        '''
        Write every token the lexer can currently produce to stdout.

        Plain text is printed as-is in white. Text inside a code fence is
        buffered and printed highlighted as complete lines become
        available.
        '''

        for token in self.lexer.parse():

            if token.type == TokenType.EOF:
                # A fence that never received its closing marker would
                # otherwise keep its buffered code forever. Flush only
                # after finish(): an EOF token seen earlier may merely
                # mean the lexer ran out of buffered input
                # (NOTE(review): confirm against SinglePassStreamingLexer).
                if self._finished:
                    self._flush_open_fence()
                break

            if token.type == TokenType.CODE_FENCE:

                # Start on a fresh line when a fence is the first output.
                if not self.text_emitted:
                    print()
                    self.text_emitted = True

                if token.orientation == TokenOrientation.BEGIN:
                    assert self.cf_ctx is None

                    lang = token.content

                    try:
                        # stripnl=False: code is highlighted a few lines
                        # at a time, and the default would drop blank
                        # lines at the edges of each batch.
                        lexer = get_lexer_by_name(lang, stripnl=False)
                    except ClassNotFound:
                        # Unknown or missing language tag: leave both
                        # unset so the context guesses from the code.
                        lang = None
                        lexer = None

                    self.cf_ctx = CodefenceContext(lang, lexer, self.formatter)

                else:
                    assert self.cf_ctx is not None

                    self._flush_open_fence()

                # Echo the fence marker on a line of its own, for opening
                # and closing fences alike.
                print(f'{self.color_code.WHITE}```{self.color_code.RESET}', flush=True)

                continue

            if self.cf_ctx is not None:

                self.cf_ctx.buffer += token.content
                highlighted = self.cf_ctx.get_highlighted_lines()

                if highlighted:
                    print( highlighted, end='', flush=True )

            else:

                print( f'{self.color_code.WHITE}{token.content}{self.color_code.RESET}', end='', flush=True )
                self.text_emitted = True

    def finish( self : "ChatColorizer" ):
        '''
        Signal that no more chunks will be added.

        Call `print` once more afterwards to emit the remaining output.
        '''
        self.lexer.finish()
        self._finished = True
+
+
+# content = '''
+# Rust code:
+
+# ```rust
+# fn main() {
+# let x = 5;
+# let y = 10;
+# let z = x + y;
+# println!("The value of z is {}", z);
+# }
+# ```
+
+# Python code:
+
+# ```python
+# x = 5
+# y = 10
+# z = x + y
+# print("The value of z is", z)
+# ```
+
+# Unknown code:
+
+# ```
+# x = 5
+# y = 10
+# z = x + y
+# print("The value of z is", z)
+# ```
+
+
+# Testing
+
+# ```python
+# x = 5
+# y = 10
+# z = x + y
+# print("The value of z is", z)
+
+# '''
+
+# highlighter = ChatColorizer()
+
+# highlighter.add_chunk(content)
+# highlighter.finish()
+
+# highlighter.print()