src/gpt_chat_cli/chat_colorizer.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218

from pygments import highlight
from pygments.lexer import Lexer
from pygments.formatter import Formatter
from pygments.lexers import (
    get_lexer_by_name, get_all_lexers, guess_lexer,
    find_lexer_class_by_name
)
from pygments.formatters import (
    TerminalFormatter,
    NullFormatter
)

from dataclasses import dataclass
from typing import Optional
from pygments.util import ClassNotFound

from .streaming_lexer import (
    SinglePassStreamingLexer,
    Token,
    TokenType,
    TokenOrientation
)

from .color import (
    get_color_codes,
    ColorCode,
)

# Candidate lexer aliases considered when the language of a code block
# has to be guessed. Guessing takes time, so rather than trying every
# lexer Pygments knows, only this short list is tried; it is assumed to
# cover upward of 90% of usage.
#
# The list is iterated in order by guess_lexer(), which only replaces its
# current best candidate on a strictly higher score, so earlier entries
# win ties.
GUESSABLE_LANGUAGES = [
    'html',
    'python',
    'java',
    'python2',
    'c++',
    'javascript',
    'c#',
    'sql',
    'c',
    'php',
    'go',
    'swift',
    'kotlin',
    'ruby',
    'typescript',
    'scala',
    'r',
    'rust',
    'css',
    'perl',
    'make',
    'text'
]

def guess_lexer( text : str, **options ):
    '''
    Guess the lexer for `text`, considering only GUESSABLE_LANGUAGES.

    Every candidate lexer class scores the text with its `analyse_text`
    heuristic. A candidate scoring exactly 1.0 is instantiated and
    returned immediately; otherwise the highest-scoring candidate wins,
    with ties going to the candidate listed first.

    The heuristics are very primitive and the guess is not very good.

    Note: within this module this definition replaces the `guess_lexer`
    name imported from `pygments.lexers`.

    Parameters:
        text: source code whose language should be guessed
        **options: forwarded unchanged to the chosen lexer's constructor

    Returns:
        An instance of the best matching lexer class.

    Raises:
        ClassNotFound: if no candidate scores above zero.
    '''

    best_score = 0.0
    best_lexer = None

    for lexer_name in GUESSABLE_LANGUAGES:

        try:
            lexer = find_lexer_class_by_name(lexer_name)
        except ClassNotFound:
            # The installed Pygments does not know this alias. A single
            # unavailable candidate should not abort guessing altogether.
            continue

        score = lexer.analyse_text(text)

        if score == 1.0:
            # Cannot be beaten; skip the remaining candidates
            return lexer(**options)

        # Strict comparison: earlier candidates win ties
        if score > best_score:
            best_score, best_lexer = score, lexer

    # best_lexer is only ever assigned together with a score above zero
    if best_lexer is None:
        raise ClassNotFound('no lexer matching the text found')

    return best_lexer(**options)

@dataclass
class CodefenceContext:
    '''
    State of one fenced code block that is highlighted while it streams in.

    Incoming code is appended to `buffer` by the owner of the context and
    released by get_highlighted_lines(). While `language` is None the
    lexer is unknown and gets guessed from the buffered text once enough
    of it is available.
    '''

    # Language of the block; None until it has been guessed
    language: Optional[str]
    # Pygments lexer used for highlighting; None while the language is unknown
    lexer : Optional[Lexer]
    # Pygments formatter that renders the highlighted text
    formatter : Formatter
    # Code received so far that has not been highlighted yet
    buffer : str = ''
    # Set by the owner when no further code will be added to the buffer
    eof : bool = False

    def may_guess_language( self : "CodefenceContext" ):
        '''
        Tell whether the language may be guessed from the current buffer.

        Once `eof` is set the answer is always True, since the buffer
        will not grow any further. Otherwise the buffer must hold more
        than MIN_CHARACTERS characters and more than MIN_LINES newlines.
        '''

        if self.eof:
            return True

        MIN_CHARACTERS = 150
        MIN_LINES = 2

        return (
            len(self.buffer) > MIN_CHARACTERS and
            self.buffer.count('\n') > MIN_LINES
        )

    def get_highlighted_lines(self : "CodefenceContext"):
        '''
        Highlight and remove the printable part of the buffer.

        Before `eof` only complete lines (everything up to and including
        the last newline) are released; an unfinished last line stays in
        the buffer. Once `eof` is set the whole remaining buffer is
        released, so a final line without a trailing newline is not lost.

        Returns:
            The highlighted text, or None when nothing can be released
            yet (language still unknown and not guessable, no complete
            line buffered, or the buffer is empty).

        Raises:
            ClassNotFound: propagated from guess_lexer() when the
            language has to be guessed and no candidate matches.
        '''

        if self.language is None:

            if not self.may_guess_language():
                return None

            # stripnl=False: by default a Pygments lexer strips leading and
            # trailing newlines from its input. As the code is highlighted
            # chunk by chunk, that would swallow blank lines at chunk edges.
            lexer = guess_lexer( self.buffer, stripnl=False )

            self.language = lexer.name
            self.lexer = lexer

        if self.eof:
            # Nothing more will arrive: flush everything, including a
            # trailing partial line (Pygments' default `ensurenl` lexer
            # option terminates it with a newline).
            cut = len(self.buffer)
        else:
            # rfind() yields -1 when there is no newline, making cut == 0
            cut = self.buffer.rfind('\n') + 1

        if cut == 0:
            return None

        lines = self.buffer[:cut]
        self.buffer = self.buffer[cut:]

        return highlight(
            lines,
            self.lexer,
            self.formatter,
        )

class ChatColorizer( object ):
    '''
    Prints streamed chat text, syntax highlighting fenced code blocks.

    Text is fed in with add_chunk() and written to stdout by print().
    Tokens outside of a code fence are printed in the WHITE color code;
    tokens inside a code fence are collected in a CodefenceContext and
    printed as highlighted lines.
    '''

    lexer : SinglePassStreamingLexer
    formatter : Formatter
    cf_ctx : Optional[CodefenceContext]
    color_code : ColorCode
    text_emitted: bool

    def __init__( self : "ChatColorizer", no_color = False ):
        '''
        Parameters:
            no_color: when true, code is formatted with NullFormatter
                instead of TerminalFormatter, and the color codes are
                requested with no_color set.
        '''
        self.lexer = SinglePassStreamingLexer()

        # Context of the code fence currently open, None outside of fences
        self.cf_ctx = None

        if no_color:
            self.formatter = NullFormatter()
        else:
            self.formatter = TerminalFormatter()

        self.color_code = get_color_codes( no_color=no_color )

        # Becomes True once anything has been printed
        self.text_emitted = False

    def add_chunk( self : "ChatColorizer", chunk : str ):
        '''Pass a piece of the incoming text on to the streaming lexer.'''
        self.lexer.add_chunk( chunk )

    def print( self : "ChatColorizer" ):
        '''
        Print the tokens the streaming lexer currently yields.

        Processing stops at the first EOF token.
        '''

        for token in self.lexer.parse():

            if token.type == TokenType.EOF:
                # NOTE(review): code still buffered in an open fence is
                # not flushed here. Whether EOF marks the end of the whole
                # stream or only of the input received so far is not
                # visible in this file -- confirm before adding a flush.
                break

            if token.type == TokenType.CODE_FENCE:

                # When a fence is the very first thing printed, emit an
                # empty line before it
                if not self.text_emitted:
                    print()
                    self.text_emitted = True

                if token.orientation == TokenOrientation.BEGIN:
                    assert self.cf_ctx is None

                    lang = token.content

                    try:
                        # stripnl=False: by default a Pygments lexer strips
                        # leading and trailing newlines from its input. As
                        # code is highlighted chunk by chunk, that would
                        # swallow blank lines at chunk edges.
                        lexer = get_lexer_by_name(lang, stripnl=False)
                    except ClassNotFound:
                        # Unknown language: leave it to the context to
                        # guess it from the code
                        lang = None
                        lexer = None

                    self.cf_ctx = CodefenceContext(lang, lexer, self.formatter)

                else:
                    assert self.cf_ctx is not None

                    # No more code will follow; lets the context guess the
                    # language even when only little code was buffered
                    self.cf_ctx.eof = True

                    highlighted = self.cf_ctx.get_highlighted_lines()

                    if highlighted:
                        print( highlighted, end='', flush=True )

                    self.cf_ctx = None

                # Print the fence marker itself on a line of its own
                print(f'{self.color_code.WHITE}```{self.color_code.RESET}', flush=True)

                continue

            if self.cf_ctx:

                # Inside a code fence: buffer the code and print whatever
                # the context is able to release
                self.cf_ctx.buffer += token.content
                highlighted = self.cf_ctx.get_highlighted_lines()

                if highlighted:
                    print( highlighted, end='', flush=True )

            else:

                print( f'{self.color_code.WHITE}{token.content}{self.color_code.RESET}', end='', flush=True )
                self.text_emitted = True


    def finish( self : "ChatColorizer" ):
        '''Forward the finish notification to the streaming lexer.'''
        self.lexer.finish()