 1"""Tokenizer for GPT-2.
 2
 3This is a thin wrapper around the tiktoken library, allowing for easy
 4unit testing.
 5"""
 6
 7from abc import ABC, abstractmethod
 8from collections.abc import Sequence
 9
10import tiktoken
11
12__all__ = [
13    "Tokenizer",
14    "get_tokenizer",
15]
16
17
class Tokenizer(ABC):
    """Interface shared by all tokenizer implementations.

    Concrete subclasses wrap a tokenizer library and expose the two
    operations the rest of the package needs: encoding text to token
    ids and decoding token ids back to text.
    """

    @abstractmethod
    def encode(self, text: str) -> list[int]:
        """Convert *text* into a list of token ids."""

    @abstractmethod
    def decode(self, tokens: Sequence[int]) -> str:
        """Convert a sequence of token ids back into text."""

class TiktokenTokenizer(Tokenizer):
    """GPT-2 tokenizer backed by the tiktoken library."""

    def __init__(self, encoding_name: str) -> None:
        """Look up and hold the tiktoken encoding named *encoding_name*."""
        self.encoding = tiktoken.get_encoding(encoding_name)

    def encode(self, text: str) -> list[int]:
        """Turn *text* into a list of token ids."""
        encoding = self.encoding
        return encoding.encode(text)

    def decode(self, tokens: Sequence[int]) -> str:
        """Turn a sequence of token ids back into text."""
        encoding = self.encoding
        return encoding.decode(tokens)

class DocumentTokenizer(Tokenizer):
    """Tokenizer for GPT-2 using tiktoken that encodes documents.

    Each encoded document is prefixed with the ``<|endoftext|>`` token so
    that documents can be concatenated into a single pre-training token
    stream. Decoding is intentionally unsupported.
    """

    def __init__(self, encoding_name: str) -> None:
        """Initialize the tokenizer.

        Args:
            encoding_name: Name of the tiktoken encoding (e.g. ``"gpt2"``).
        """
        self.encoding = tiktoken.get_encoding(encoding_name)
        # Use the public eot_token accessor instead of reaching into the
        # private _special_tokens mapping of the Encoding object.
        self._eot = self.encoding.eot_token

    def encode(self, text: str) -> list[int]:
        """Encode the text into tokens, prefixed with the end-of-text token."""
        tokens = [self._eot]
        # Ignore any special tokens in the document: encode_ordinary treats
        # special-token markers appearing in the text as plain text.
        tokens.extend(self.encoding.encode_ordinary(text))
        return tokens

    def decode(self, tokens: Sequence[int]) -> str:
        """Decoding is not supported for document tokenization.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("DocumentTokenizer does not support decoding")

def get_tokenizer() -> Tokenizer:
    """Get the GPT-2 tokenizer."""
    tokenizer = TiktokenTokenizer("gpt2")
    return tokenizer

def get_document_tokenizer() -> Tokenizer:
    """Get the GPT-2 tokenizer for encoding documents for pre-training (only)."""
    tokenizer = DocumentTokenizer("gpt2")
    return tokenizer