nano_gpt.datasets.tinyshakespeare

Data loader library for the tinyshakespeare dataset.

This is a thin wrapper around the HuggingFace datasets library.

 1"""Data loader library for the tinyshakespeare dataset.
 2
 3This is a thin wrapper around the HuggingFace datasets library.
 4"""
 5
 6import datasets
 7
 8from nano_gpt.config import TrainDataset
 9
10
11__all__ = [
12    "DATASET",
13    "load_dataset",
14]
15
16
17def load_dataset(split: str, streaming: bool = True) -> datasets.Dataset:
18    """Load the dataset.
19
20    Streaming flag is ignored because the tinyshakespeare dataset is small.
21    """
22    return datasets.load_dataset(
23        "tiny_shakespeare", trust_remote_code=True, split=split
24    )
25
26
27DATASET = TrainDataset(
28    name="tinyshakespeare",
29    load_fn=load_dataset,
30    total_tokens=301967,  # Approximately 300k tokens
31    # Set a limit higher than the dataset size. The entire dataset is a
32    # single record so it must fit in 1 shard.
33    tokens_per_shard=400000,
34)
DATASET = TrainDataset(name='tinyshakespeare', load_fn=<function load_dataset>, total_tokens=301967, tokens_per_shard=400000)
def load_dataset(split: str, streaming: bool = True) -> datasets.arrow_dataset.Dataset:
18def load_dataset(split: str, streaming: bool = True) -> datasets.Dataset:
19    """Load the dataset.
20
21    Streaming flag is ignored because the tinyshakespeare dataset is small.
22    """
23    return datasets.load_dataset(
24        "tiny_shakespeare", trust_remote_code=True, split=split
25    )

Load the dataset.

Streaming flag is ignored because the tinyshakespeare dataset is small.