nano_gpt.datasets

Datasets for training and evaluating the model.

 1"""Datasets for training and evaluating the model."""
 2
 3from . import finewebedu, tinyshakespeare
 4
 5__all__ = [
 6    "TRAIN_DATASETS",
 7    "finewebedu",
 8    "tinyshakespeare",
 9    "hellaswag",
10]
11
12
# Datasets available for training, in the order they are registered.
TRAIN_DATASETS_LIST = [finewebedu.DATASET, tinyshakespeare.DATASET]

# Lookup table keyed by each dataset's `name` attribute, built from
# TRAIN_DATASETS_LIST; if two entries shared a name, the later one would win.
TRAIN_DATASETS = {entry.name: entry for entry in TRAIN_DATASETS_LIST}
TRAIN_DATASETS = {'finewebedu': TrainDataset(name='finewebedu', load_fn=<function load_dataset>, total_tokens=10000000000, tokens_per_shard=100000000), 'tinyshakespeare': TrainDataset(name='tinyshakespeare', load_fn=<function load_dataset>, total_tokens=301967, tokens_per_shard=400000)}