aboutsummaryrefslogtreecommitdiff
path: root/tests/test_streaming_lexer.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_streaming_lexer.py')
-rw-r--r--tests/test_streaming_lexer.py207
1 files changed, 207 insertions, 0 deletions
diff --git a/tests/test_streaming_lexer.py b/tests/test_streaming_lexer.py
new file mode 100644
index 0000000..cd03513
--- /dev/null
+++ b/tests/test_streaming_lexer.py
@@ -0,0 +1,207 @@
+import re
+from typing import Optional, Tuple
+from enum import Enum, auto
+from dataclasses import dataclass
+
+import pytest
+
+from src.gpt_chat_cli.streaming_lexer import (
+ MatchState,
+ CodeFenceContext,
+ _try_to_parse_code_fence,
+ SinglePassStreamingLexer,
+ Token,
+ TokenType,
+ TokenOrientation,
+ make_text_token
+)
+
+def test_try_to_parse_code_fence():
+ # Test valid cases
+ valid_cases = [
+ ("```python\nhe", CodeFenceContext(0, "python", 10)),
+ (" ```python\n", CodeFenceContext(2, "python", 12)),
+ ("~~~python\n", CodeFenceContext(0, "python", 10)),
+ (" ~~~python\nmore", CodeFenceContext(3, "python", 13))
+ ]
+
+
+ for case, expected in valid_cases:
+ result = _try_to_parse_code_fence(case)
+ assert result[0] == MatchState.MATCH
+ assert result[1] == expected
+
+ # Test invalid cases
+ invalid_cases = [
+ " ```python\n",
+ "~``python\n",
+ "```python ```\n",
+ "~~~python ~~~\n",
+ ]
+
+ for case in invalid_cases:
+ print(case)
+ result = _try_to_parse_code_fence(case)
+ assert result[0] == MatchState.MISMATCH
+
+ # Test indeterminate case
+ indeterminate_cases = [
+ "```",
+ " ~~~",
+ ]
+
+ for case in indeterminate_cases:
+ result = _try_to_parse_code_fence(case)
+ assert result[0] == MatchState.INDETERMINATE
+
+def _check_exact_lexing_matches( chunk_tokens, final_tokens ):
+
+ lexer = SinglePassStreamingLexer()
+
+ for ( chunk, expected_tokens ) in chunk_tokens:
+
+ lexer.add_chunk( chunk )
+
+ n_tokens_emitted = 0
+
+ for i, token in enumerate(lexer.parse()):
+ assert i < len(expected_tokens)
+ assert expected_tokens[i] == token
+
+ n_tokens_emitted += 1
+
+ assert n_tokens_emitted == len(expected_tokens)
+
+ lexer.finish()
+
+ n_tokens_emitted = 0
+
+ for i, token in enumerate(lexer.parse()):
+ assert i < len(final_tokens)
+ print(token)
+ assert final_tokens[i] == token
+
+ n_tokens_emitted += 1
+
+ assert n_tokens_emitted == len(final_tokens)
+
+
+def test_single_pass_lexing():
+
+ cases = [
+ ( 'Some text\n', [
+ make_text_token( 'Some text\n' )
+ ] ),
+ ( 'More text\n', [
+ make_text_token( 'More text\n' )
+ ] ),
+ ( ' Indented text\n', [
+ make_text_token( ' Indented text\n' )
+ ] ),
+ ( '```python\n', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'python' )
+ ] ),
+ ( 'print("Hello")\n', [
+ make_text_token( 'print("Hello")\n' )
+ ] ),
+ ( '```', [] ),
+ ]
+
+ final_tokens = [
+ Token( TokenType.CODE_FENCE, TokenOrientation.END ),
+ Token( TokenType.EOF, TokenOrientation.NONE ),
+ ]
+
+ _check_exact_lexing_matches( cases, final_tokens )
+
+ cases = [
+ ( '```java\nSome text\nMore ', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ),
+ make_text_token( 'Some text\n' ),
+ make_text_token( 'More ' ),
+ ] ),
+ ( ' text\n```', [
+ make_text_token( ' text\n' ),
+ ] ),
+ ( '\n', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.END )
+ ]),
+ ]
+
+ final_tokens = [
+ Token( TokenType.EOF, TokenOrientation.NONE ),
+ ]
+
+ _check_exact_lexing_matches( cases, final_tokens )
+
+ cases = [
+ ( ' ```java \n Some text\n More ', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ),
+ make_text_token( 'Some text\n' ),
+ make_text_token( 'More ' ),
+ ] ),
+ ( ' text\n ```', [
+ make_text_token( ' text\n' ),
+ ] ),
+ ( '\n', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.END )
+ ]),
+ ]
+
+ final_tokens = [
+ Token( TokenType.EOF, TokenOrientation.NONE ),
+ ]
+
+ _check_exact_lexing_matches( cases, final_tokens )
+
+ cases = [
+ ( ' ``', []),
+ ('` java \n Some text\n More ', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ),
+ make_text_token( 'Some text\n' ),
+ make_text_token( 'More ' ),
+ ] ),
+ ( ' text\n ```', [
+ make_text_token( ' text\n' ),
+ ] ),
+ ( '\n', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.END )
+ ]),
+ ]
+
+ final_tokens = [
+ Token( TokenType.EOF, TokenOrientation.NONE ),
+ ]
+
+ _check_exact_lexing_matches( cases, final_tokens )
+
+ # Ticks preceded by characters don't initiate a code block
+ cases = [
+ ( 'tick```java\nSome text\n', [
+ make_text_token( 'tick```java\n' ),
+ make_text_token( 'Some text\n' ),
+ ] ),
+ ]
+
+ final_tokens = [
+ Token( TokenType.EOF, TokenOrientation.NONE ),
+ ]
+
+ _check_exact_lexing_matches( cases, final_tokens )
+
+ # Code blocks which are not terminated, terminate
+ # at the end of the document
+ cases = [
+ ( '```java\nSome text\n', [
+ Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ),
+ make_text_token( 'Some text\n' ),
+ ] ),
+ ]
+
+ final_tokens = [
+ Token( TokenType.CODE_FENCE, TokenOrientation.END ),
+ Token( TokenType.EOF, TokenOrientation.NONE ),
+ ]
+
+ _check_exact_lexing_matches( cases, final_tokens )
+