1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
|
from pygments import highlight
from pygments.lexer import Lexer
from pygments.formatter import Formatter
from pygments.lexers import (
get_lexer_by_name, get_all_lexers, guess_lexer,
find_lexer_class_by_name
)
from pygments.formatters import (
TerminalFormatter,
NullFormatter
)
from dataclasses import dataclass
from typing import Optional
from pygments.util import ClassNotFound
from .streaming_lexer import (
SinglePassStreamingLexer,
Token,
TokenType,
TokenOrientation
)
from .color import (
get_color_codes,
ColorCode,
)
# Guessing a language takes time, so only the candidates below are
# considered. They are assumed to cover upward of 90% of usage.
# guess_lexer() walks this list in order.
GUESSABLE_LANGUAGES = [
    'html', 'python', 'java', 'python2',
    'c++', 'javascript', 'c#', 'sql',
    'c', 'php', 'go', 'swift',
    'kotlin', 'ruby', 'typescript', 'scala',
    'r', 'rust', 'css', 'perl',
    'make', 'text',
]
def guess_lexer( text : str, **options ):
    '''
    Choose a lexer for `text` among the GUESSABLE_LANGUAGES candidates.

    Each candidate lexer class scores the text through analyse_text().
    A score of exactly 1.0 is accepted on the spot; otherwise the
    highest score above zero wins, and on equal scores the candidate
    listed first is kept. The winning class is instantiated with
    `options`.

    Raises ClassNotFound when no candidate scores above zero.
    The heuristics are primitive, so expect misses.
    '''
    top_score = 0.0
    top_class = None

    for alias in GUESSABLE_LANGUAGES:
        candidate = find_lexer_class_by_name(alias)
        score = candidate.analyse_text(text)

        # A perfect score cannot be beaten, stop searching
        if score == 1.0:
            return candidate(**options)

        # Strict comparison: an earlier candidate survives a tie
        if score > top_score:
            top_score, top_class = score, candidate

    if top_class is None or not top_score:
        raise ClassNotFound('no lexer matching the text found')

    return top_class(**options)
@dataclass
class CodefenceContext:
    '''
    Highlighting state for one fenced code block whose text arrives
    incrementally.

    Incoming text is appended to `buffer` and handed to pygments one
    batch of complete lines at a time. `language` and `lexer` are None
    when the fence did not name a language pygments knows; the lexer is
    then guessed from the buffered text once may_guess_language()
    allows it.
    '''

    # Language of the block; None until it has been guessed
    language: Optional[str]
    # Lexer used for highlighting; None for as long as language is None
    lexer : Optional[Lexer]
    # Formatter that renders the highlighted output
    formatter : Formatter
    # Text received so far that has not been highlighted yet
    buffer : str = ''
    # Set by the owner once the closing fence has been seen
    eof : bool = False

    def may_guess_language( self : "CodefenceContext" ) -> bool:
        '''
        Tell whether enough text is buffered to guess the language.

        Always True once `eof` is set, since no more text will come.
        Otherwise the buffer must hold more than MIN_CHARACTERS
        characters and more than MIN_LINES newlines.
        '''
        if self.eof:
            return True

        MIN_CHARACTERS = 150
        MIN_LINES = 2

        return (
            len(self.buffer) > MIN_CHARACTERS and
            self.buffer.count('\n') > MIN_LINES
        )

    def get_highlighted_lines( self : "CodefenceContext" ) -> Optional[str]:
        '''
        Highlight and consume the complete lines held in the buffer.

        Returns the highlighted text, or None when nothing can be
        emitted yet: either the language is still unknown and may not
        be guessed, or the buffer holds no complete line. Text after
        the last newline stays in the buffer for the next call, except
        at `eof`, where a non-blank unterminated last line is flushed
        as well so that it is not lost.
        '''
        if self.language is None:
            if not self.may_guess_language():
                return None

            # Pygments lexers strip leading and trailing newlines from
            # their input by default. The block is highlighted in
            # pieces, so that would swallow blank lines at the edge of
            # a piece; stripnl=False keeps them.
            lexer = guess_lexer( self.buffer, stripnl=False )
            self.language = lexer.name
            self.lexer = lexer

        # Index just past the last newline; 0 when there is none
        cut = self.buffer.rfind('\n') + 1

        # No more text will arrive after the closing fence: emit the
        # unterminated last line too, unless it is only whitespace.
        if self.eof and self.buffer[cut:].strip():
            cut = len(self.buffer)

        if cut == 0:
            return None

        lines = self.buffer[:cut]
        self.buffer = self.buffer[cut:]

        return highlight(
            lines,
            self.lexer,
            self.formatter,
        )
class ChatColorizer( object ):
    '''
    Print streamed chat text to the terminal, syntax-highlighting the
    content of fenced code blocks.

    Text is fed in with add_chunk(); print() writes out whatever tokens
    the streaming lexer can produce so far; finish() tells the lexer
    that the input is complete.
    '''

    lexer : SinglePassStreamingLexer
    formatter : Formatter
    cf_ctx : Optional[CodefenceContext]
    color_code : ColorCode
    text_emitted: bool

    def __init__( self : "ChatColorizer", no_color = False ):
        '''
        Set up the streaming lexer, the formatter and the color codes.

        With `no_color` set, a NullFormatter is used instead of a
        TerminalFormatter and `no_color` is forwarded to
        get_color_codes().
        '''
        self.lexer = SinglePassStreamingLexer()

        # Context of the code fence being printed, None outside fences
        self.cf_ctx = None

        if no_color:
            self.formatter = NullFormatter()
        else:
            self.formatter = TerminalFormatter()

        self.color_code = get_color_codes( no_color=no_color )

        # Becomes True as soon as anything has been handled by print()
        self.text_emitted = False

    def add_chunk( self : "ChatColorizer", chunk : str ):
        '''
        Hand the next piece of streamed text to the lexer.
        '''
        self.lexer.add_chunk( chunk )

    def print( self : "ChatColorizer" ):
        '''
        Write the tokens currently available from the lexer to stdout.

        Stops at the first EOF token. Text outside code fences is
        printed as is, wrapped in the WHITE and RESET color codes. Text
        inside a fence is buffered in a CodefenceContext and printed
        highlighted as complete lines become available.
        '''
        for token in self.lexer.parse():

            if token.type == TokenType.EOF:
                break

            if token.type == TokenType.CODE_FENCE:

                # A fence that opens the output gets a leading newline
                if not self.text_emitted:
                    print()
                    self.text_emitted = True

                if token.orientation == TokenOrientation.BEGIN:
                    assert self.cf_ctx is None

                    lang = token.content

                    try:
                        # Pygments lexers strip leading and trailing
                        # newlines from their input by default. The
                        # block is highlighted in pieces, so that would
                        # swallow blank lines at the edge of a piece;
                        # stripnl=False keeps them.
                        lexer = get_lexer_by_name( lang, stripnl=False )
                    except ClassNotFound:
                        # Unknown language: leave both unset so that
                        # the context guesses from the code itself
                        lang = None
                        lexer = None

                    self.cf_ctx = CodefenceContext(lang, lexer, self.formatter)

                else:
                    assert self.cf_ctx is not None

                    # Closing fence: flush what is still buffered
                    self.cf_ctx.eof = True
                    highlighted = self.cf_ctx.get_highlighted_lines()

                    if highlighted:
                        print( highlighted, end='', flush=True )

                    self.cf_ctx = None

                # The fence marker is followed by a newline on both
                # sides of the code
                print(f'{self.color_code.WHITE}```{self.color_code.RESET}', flush=True)

                continue

            if self.cf_ctx:
                # Inside a fence: buffer, then print the complete lines
                self.cf_ctx.buffer += token.content
                highlighted = self.cf_ctx.get_highlighted_lines()

                if highlighted:
                    print( highlighted, end='', flush=True )
            else:
                print( f'{self.color_code.WHITE}{token.content}{self.color_code.RESET}', end='', flush=True )

            self.text_emitted = True

    def finish( self : "ChatColorizer" ):
        '''
        Tell the lexer that no more text will be added.
        '''
        self.lexer.finish()
|