nano_gpt.datasets.tinyshakespeare
Data loader library for the tinyshakespeare dataset.
This is a thin wrapper around the HuggingFace datasets library.
"""Data loader library for the tinyshakespeare dataset.

This is a thin wrapper around the HuggingFace datasets library.
"""

import datasets

from nano_gpt.config import TrainDataset


__all__ = [
    "DATASET",
    "load_dataset",
]


def load_dataset(split: str, streaming: bool = True) -> datasets.Dataset:
    """Load one split of the tinyshakespeare dataset.

    Args:
        split: Name of the split, forwarded unchanged to
            ``datasets.load_dataset``.
        streaming: Accepted but not used. The tinyshakespeare dataset is
            small, so the flag is never forwarded to the loader.

    Returns:
        Whatever ``datasets.load_dataset`` returns for the
        ``"tiny_shakespeare"`` dataset and the requested split.
    """
    # NOTE: `trust_remote_code=True` is handed to the HuggingFace loader
    # as-is; see the `datasets.load_dataset` docs for its implications.
    dataset = datasets.load_dataset(
        "tiny_shakespeare",
        split=split,
        trust_remote_code=True,
    )
    return dataset


# Dataset descriptor for tinyshakespeare.
DATASET = TrainDataset(
    name="tinyshakespeare",
    load_fn=load_dataset,
    # Approximately 300k tokens.
    total_tokens=301_967,
    # Set the shard limit above the dataset size: the entire dataset is a
    # single record, so it must fit in 1 shard.
    tokens_per_shard=400_000,
)
DATASET =
TrainDataset(name='tinyshakespeare', load_fn=<function load_dataset>, total_tokens=301967, tokens_per_shard=400000)
def
load_dataset(split: str, streaming: bool = True) -> datasets.arrow_dataset.Dataset:
def load_dataset(split: str, streaming: bool = True) -> datasets.Dataset:
    """Load one split of the tinyshakespeare dataset.

    Args:
        split: Name of the split, forwarded unchanged to
            ``datasets.load_dataset``.
        streaming: Accepted but not used. The tinyshakespeare dataset is
            small, so the flag is never forwarded to the loader.

    Returns:
        Whatever ``datasets.load_dataset`` returns for the
        ``"tiny_shakespeare"`` dataset and the requested split.
    """
    # NOTE: `trust_remote_code=True` is handed to the HuggingFace loader
    # as-is; see the `datasets.load_dataset` docs for its implications.
    dataset = datasets.load_dataset(
        "tiny_shakespeare",
        split=split,
        trust_remote_code=True,
    )
    return dataset
Load the dataset.
The `streaming` flag is ignored because the tinyshakespeare dataset is small.