From 603ebf9a866314b3304f800d50c09a3cd55d8546 Mon Sep 17 00:00:00 2001 From: flu0r1ne Date: Sat, 6 May 2023 05:42:39 -0500 Subject: Add automatic code highlighting --- tests/test_streaming_lexer.py | 207 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 tests/test_streaming_lexer.py (limited to 'tests') diff --git a/tests/test_streaming_lexer.py b/tests/test_streaming_lexer.py new file mode 100644 index 0000000..cd03513 --- /dev/null +++ b/tests/test_streaming_lexer.py @@ -0,0 +1,207 @@ +import re +from typing import Optional, Tuple +from enum import Enum, auto +from dataclasses import dataclass + +import pytest + +from src.gpt_chat_cli.streaming_lexer import ( + MatchState, + CodeFenceContext, + _try_to_parse_code_fence, + SinglePassStreamingLexer, + Token, + TokenType, + TokenOrientation, + make_text_token +) + +def test_try_to_parse_code_fence(): + # Test valid cases + valid_cases = [ + ("```python\nhe", CodeFenceContext(0, "python", 10)), + (" ```python\n", CodeFenceContext(2, "python", 12)), + ("~~~python\n", CodeFenceContext(0, "python", 10)), + (" ~~~python\nmore", CodeFenceContext(3, "python", 13)) + ] + + + for case, expected in valid_cases: + result = _try_to_parse_code_fence(case) + assert result[0] == MatchState.MATCH + assert result[1] == expected + + # Test invalid cases + invalid_cases = [ + " ```python\n", + "~``python\n", + "```python ```\n", + "~~~python ~~~\n", + ] + + for case in invalid_cases: + print(case) + result = _try_to_parse_code_fence(case) + assert result[0] == MatchState.MISMATCH + + # Test indeterminate case + indeterminate_cases = [ + "```", + " ~~~", + ] + + for case in indeterminate_cases: + result = _try_to_parse_code_fence(case) + assert result[0] == MatchState.INDETERMINATE + +def _check_exact_lexing_matches( chunk_tokens, final_tokens ): + + lexer = SinglePassStreamingLexer() + + for ( chunk, expected_tokens ) in chunk_tokens: + + lexer.add_chunk( chunk ) + + n_tokens_emitted = 0 + + for i, token in enumerate(lexer.parse()): + assert i < len(expected_tokens) + assert expected_tokens[i] == token + + n_tokens_emitted += 1 + + assert n_tokens_emitted == len(expected_tokens) + + lexer.finish() + + n_tokens_emitted = 0 + + for i, token in enumerate(lexer.parse()): + assert i < len(final_tokens) + print(token) + assert final_tokens[i] == token + + n_tokens_emitted += 1 + + assert n_tokens_emitted == len(final_tokens) + + +def test_single_pass_lexing(): + + cases = [ + ( 'Some text\n', [ + make_text_token( 'Some text\n' ) + ] ), + ( 'More text\n', [ + make_text_token( 'More text\n' ) + ] ), + ( ' Indented text\n', [ + make_text_token( ' Indented text\n' ) + ] ), + ( '```python\n', [ + Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'python' ) + ] ), + ( 'print("Hello")\n', [ + make_text_token( 'print("Hello")\n' ) + ] ), + ( '```', [] ), + ] + + final_tokens = [ + Token( TokenType.CODE_FENCE, TokenOrientation.END ), + Token( TokenType.EOF, TokenOrientation.NONE ), + ] + + _check_exact_lexing_matches( cases, final_tokens ) + + cases = [ + ( '```java\nSome text\nMore ', [ + Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ), + make_text_token( 'Some text\n' ), + make_text_token( 'More ' ), + ] ), + ( ' text\n```', [ + make_text_token( ' text\n' ), + ] ), + ( '\n', [ + Token( TokenType.CODE_FENCE, TokenOrientation.END ) + ]), + ] + + final_tokens = [ + Token( TokenType.EOF, TokenOrientation.NONE ), + ] + + _check_exact_lexing_matches( cases, final_tokens ) + + cases = [ + ( ' ```java \n Some text\n More ', [ + Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ), + make_text_token( 'Some text\n' ), + make_text_token( 'More ' ), + ] ), + ( ' text\n ```', [ + make_text_token( ' text\n' ), + ] ), + ( '\n', [ + Token( TokenType.CODE_FENCE, TokenOrientation.END ) + ]), + ] + + final_tokens = [ + Token( TokenType.EOF, TokenOrientation.NONE ), + ] + + _check_exact_lexing_matches( cases, final_tokens ) + + cases = [ + ( ' ``', []), + ('` java \n Some text\n More ', [ + Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ), + make_text_token( 'Some text\n' ), + make_text_token( 'More ' ), + ] ), + ( ' text\n ```', [ + make_text_token( ' text\n' ), + ] ), + ( '\n', [ + Token( TokenType.CODE_FENCE, TokenOrientation.END ) + ]), + ] + + final_tokens = [ + Token( TokenType.EOF, TokenOrientation.NONE ), + ] + + _check_exact_lexing_matches( cases, final_tokens ) + + # Ticks preceded by characters don't initiate a code block + cases = [ + ( 'tick```java\nSome text\n', [ + make_text_token( 'tick```java\n' ), + make_text_token( 'Some text\n' ), + ] ), + ] + + final_tokens = [ + Token( TokenType.EOF, TokenOrientation.NONE ), + ] + + _check_exact_lexing_matches( cases, final_tokens ) + + # Code blocks which are not terminated, terminate + # at the end of the document + cases = [ + ( '```java\nSome text\n', [ + Token( TokenType.CODE_FENCE, TokenOrientation.BEGIN, 'java' ), + make_text_token( 'Some text\n' ), + ] ), + ] + + final_tokens = [ + Token( TokenType.CODE_FENCE, TokenOrientation.END ), + Token( TokenType.EOF, TokenOrientation.NONE ), + ] + + _check_exact_lexing_matches( cases, final_tokens ) + -- cgit v1.2.3