# nano_gpt.datasets.finewebedu
#
# Data loader library for the finewebedu 10B dataset.
#
# This is a thin wrapper around the HuggingFace datasets library that
# handles sharding the dataset.
#
# See https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
"""Data loader library for the finewebedu 10B dataset.

This is a thin wrapper around the HuggingFace datasets library that
handles sharding the dataset.

See https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
"""

import logging

import datasets

from nano_gpt.config import TrainDataset


16_LOGGER = logging.getLogger(__name__)
17
18__all__ = [
19    "DATASET",
20    "load_dataset",
21]
22
23
24# This dataset only has a train split so we create a validation split
25# by taking the last 10% of the training data.
26_SPLITS = {
27    "train": "train[:97%]",
28    "validation": "train[97%:]",
29}
30
31
32def load_dataset(split: str, streaming: bool = True) -> datasets.Dataset:
33    """Load the dataset."""
34    if split not in _SPLITS:
35        raise ValueError(
36            f"Invalid split: {split}. Must be one of {list(_SPLITS.keys())}."
37        )
38    return datasets.load_dataset(
39        "HuggingFaceFW/fineweb-edu",
40        name="sample-10BT",
41        streaming=streaming,
42        split=_SPLITS[split],
43    )
44
45
46DATASET = TrainDataset(
47    name="finewebedu",
48    load_fn=load_dataset,
49    total_tokens=int(10e9),  # 10B tokens,
50    tokens_per_shard=int(100e6),  # 100M tokens
51)