From af5a2996234768921b81d96ffaae00cb88229862 Mon Sep 17 00:00:00 2001
From: flu0r1ne
Date: Wed, 1 Nov 2023 20:46:01 -0500
Subject: Initial commit

---
 llama/tokenizer.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 llama/tokenizer.py

diff --git a/llama/tokenizer.py b/llama/tokenizer.py
new file mode 100644
index 0000000..937a0b8
--- /dev/null
+++ b/llama/tokenizer.py
@@ -0,0 +1,89 @@
+"""
+Llama Tokenizer
+===============
+This module contains the Tokenizer class that wraps the SentencePiece tokenizer.
+"""
+
+from typing import List
+from sentencepiece import SentencePieceProcessor  # type: ignore
+
+class Tokenizer:
+    """
+    Llama Tokenizer Class
+    ---------------------
+    This class provides a wrapper around the SentencePiece tokenizer.
+    It adds some utility functions for easier encoding and decoding.
+
+    Attributes:
+        bos_id (int): The id representing the "beginning of sentence" token.
+        eos_id (int): The id representing the "end of sentence" token.
+        pad_id (int): The id representing the padding token.
+        vocab_size (int): The size of the vocabulary.
+    """
+
+    def __init__(self, model_path: str):
+        """
+        Initialize the Tokenizer.
+
+        Args:
+            model_path (str): The path to the SentencePiece model file.
+
+        Returns:
+            None
+        """
+        sp = SentencePieceProcessor(model_file=model_path)
+
+        self.bos_id: int = sp.bos_id()
+        self.eos_id: int = sp.eos_id()
+        self.pad_id: int = sp.pad_id()
+        self.vocab_size: int = sp.vocab_size()
+
+        self.sp = sp
+
+    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
+        """
+        Encode a string as a sequence of token IDs.
+
+        Args:
+            s (str): The string to be encoded.
+            bos (bool, optional): Whether to add a "beginning of sentence" token. Defaults to False.
+            eos (bool, optional): Whether to add an "end of sentence" token. Defaults to False.
+
+        Returns:
+            List[int]: The list of token IDs.
+        """
+        tokens = []
+
+        if bos:
+            tokens.append(self.bos_id)
+
+        tokens.extend(self.sp.encode(s))
+
+        if eos:
+            tokens.append(self.eos_id)
+
+        return tokens
+
+    def decode(self, tokens: List[int]) -> str:
+        """
+        Decode a sequence of token IDs to a string.
+
+        Args:
+            tokens (List[int]): The list of token IDs to be decoded.
+
+        Returns:
+            str: The decoded string.
+        """
+        return self.sp.decode(tokens)
+
+    def id_to_piece(self, token: int) -> str:
+        """
+        Convert a token ID to its corresponding token string.
+
+        Args:
+            token (int): The token ID.
+
+        Returns:
+            str: The token string, with SentencePiece's '▁' character replaced by a space.
+        """
+        return self.sp.id_to_piece(token).replace('▁', ' ')
--
cgit v1.2.3
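
Usage note (not part of the patch): a minimal sketch of how this wrapper is
meant to be driven. The model path below is a placeholder; any trained
SentencePiece model file (such as the 'tokenizer.model' shipped alongside
Llama weights) should work, and the exact token IDs depend on that model.

    from llama.tokenizer import Tokenizer

    # Placeholder path to a trained SentencePiece model file.
    tok = Tokenizer(model_path="tokenizer.model")

    # encode() can wrap the text with the BOS/EOS control tokens.
    ids = tok.encode("Hello, world!", bos=True, eos=True)
    assert ids[0] == tok.bos_id and ids[-1] == tok.eos_id

    # SentencePiece treats BOS/EOS as control tokens, so decode()
    # should return the original text without them.
    print(tok.decode(ids))       # -> "Hello, world!"

    # Inspect an individual piece; '▁' has been mapped back to a space.
    print(tok.id_to_piece(ids[1]))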