nano_gpt.datasets.finewebedu — data loader for the fineweb-edu 10B-token sample dataset.
A thin wrapper around the HuggingFace ``datasets`` library that handles sharding the dataset.
See https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
"""Data loader library for the finewebedu 10B dataset.

This is a thin wrapper around the HuggingFace datasets library that
handles sharding the dataset.

See https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
"""

import logging

import datasets

from nano_gpt.config import TrainDataset


_LOGGER = logging.getLogger(__name__)

__all__ = [
    "DATASET",
    "load_dataset",
]


# This dataset only has a train split so we create a validation split
# by taking the last 3% of the training data.
_SPLITS = {
    "train": "train[:97%]",
    "validation": "train[97%:]",
}


def load_dataset(
    split: str, streaming: bool = True
) -> datasets.Dataset | datasets.IterableDataset:
    """Load the requested split of the fineweb-edu sample-10BT dataset.

    Args:
        split: One of "train" or "validation". The underlying dataset only
            ships a "train" split, so both values map onto slices of it
            (see ``_SPLITS``).
        streaming: When True (the default) the dataset is streamed and an
            ``IterableDataset`` is returned; when False the data is
            downloaded and a map-style ``Dataset`` is returned.

    Raises:
        ValueError: If ``split`` is not a known split name.
    """
    if split not in _SPLITS:
        raise ValueError(
            f"Invalid split: {split}. Must be one of {list(_SPLITS.keys())}."
        )
    return datasets.load_dataset(
        "HuggingFaceFW/fineweb-edu",
        name="sample-10BT",
        streaming=streaming,
        split=_SPLITS[split],
    )


# Registry entry describing this dataset for the training pipeline.
DATASET = TrainDataset(
    name="finewebedu",
    load_fn=load_dataset,
    total_tokens=int(10e9),  # 10B tokens
    tokens_per_shard=int(100e6),  # 100M tokens
)
DATASET = TrainDataset(name='finewebedu', load_fn=<function load_dataset>, total_tokens=10000000000, tokens_per_shard=100000000)
def load_dataset(split: str, streaming: bool = True) -> datasets.arrow_dataset.Dataset
def load_dataset(
    split: str, streaming: bool = True
) -> datasets.Dataset | datasets.IterableDataset:
    """Load the requested split of the fineweb-edu sample-10BT dataset.

    Args:
        split: One of "train" or "validation"; both are slices of the
            dataset's single "train" split (see ``_SPLITS``).
        streaming: When True (the default) an ``IterableDataset`` is
            streamed; when False a fully-downloaded ``Dataset`` is returned.

    Raises:
        ValueError: If ``split`` is not a known split name.
    """
    if split not in _SPLITS:
        raise ValueError(
            f"Invalid split: {split}. Must be one of {list(_SPLITS.keys())}."
        )
    return datasets.load_dataset(
        "HuggingFaceFW/fineweb-edu",
        name="sample-10BT",
        streaming=streaming,
        split=_SPLITS[split],
    )
Load the dataset.