nano_gpt.tokenizer
Tokenizer for GPT-2.
This is a thin wrapper around the tiktoken library, allowing for easy unit testing.
1"""Tokenizer for GPT-2. 2 3This is a thin wrapper around the tiktoken library, allowing for easy 4unit testing. 5""" 6 7from abc import ABC, abstractmethod 8from collections.abc import Sequence 9 10import tiktoken 11 12__all__ = [ 13 "Tokenizer", 14 "get_tokenizer", 15] 16 17 18class Tokenizer(ABC): 19 """Abstract base class for tokenizers. 20 21 This is a thin wrapper around tokenizer libraries and supports encode 22 and decode. 23 """ 24 25 @abstractmethod 26 def encode(self, text: str) -> list[int]: 27 """Encode the text into tokens.""" 28 29 @abstractmethod 30 def decode(self, tokens: Sequence[int]) -> str: 31 """Decode the tokens into text.""" 32 33 34class TiktokenTokenizer(Tokenizer): 35 """Tokenizer for GPT-2 using tiktoken.""" 36 37 def __init__(self, encoding_name: str) -> None: 38 """Initialize the tokenizer.""" 39 self.encoding = tiktoken.get_encoding(encoding_name) 40 41 def encode(self, text: str) -> list[int]: 42 """Encode the text into tokens.""" 43 return self.encoding.encode(text) 44 45 def decode(self, tokens: Sequence[int]) -> str: 46 """Decode the tokens into text.""" 47 return self.encoding.decode(tokens) 48 49 50class DocumentTokenizer(Tokenizer): 51 """Tokenizer for GPT-2 using tiktoken that encodes documents.""" 52 53 def __init__(self, encoding_name: str) -> None: 54 """Initialize the tokenizer.""" 55 self.encoding = tiktoken.get_encoding(encoding_name) 56 self._eot = self.encoding._special_tokens["<|endoftext|>"] 57 58 def encode(self, text: str) -> list[int]: 59 """Encode the text into tokens.""" 60 tokens = [self._eot] 61 # Ignore any special tokesn in the document 62 tokens.extend(self.encoding.encode_ordinary(text)) 63 return tokens 64 65 def decode(self, tokens: Sequence[int]) -> str: 66 """Decode the tokens into text.""" 67 raise NotImplementedError("DocumentTokenizer does not support decoding") 68 69 70def get_tokenizer() -> Tokenizer: 71 """Get the GPT-2 tokenizer.""" 72 return TiktokenTokenizer("gpt2") 73 74 75def 
get_document_tokenizer() -> Tokenizer: 76 """Get the GPT-2 tokenizer for encoding documents for pre-training (only).""" 77 return DocumentTokenizer("gpt2")
class Tokenizer(ABC):
    """Interface for tokenizers.

    A thin wrapper around tokenizer libraries; concrete subclasses must
    provide both encode and decode.
    """

    @abstractmethod
    def encode(self, text: str) -> list[int]:
        """Turn text into a list of token ids."""

    @abstractmethod
    def decode(self, tokens: Sequence[int]) -> str:
        """Turn a sequence of token ids back into text."""
Abstract base class for tokenizers.
This is a thin wrapper around tokenizer libraries and supports encode and decode.
def get_tokenizer() -> Tokenizer:
    """Get the GPT-2 tokenizer."""
    tokenizer: Tokenizer = TiktokenTokenizer("gpt2")
    return tokenizer
Get the GPT-2 tokenizer.