
ezpz.data.llama

ezpz/data/llama.py

LlamaDataLoader

Loads a Hugging Face dataset split, tokenizes its 'text' column with a LLaMA tokenizer, and exposes the result as a PyTorch DataLoader.

Source code in src/ezpz/data/llama.py
# Module-level imports this class relies on:
from typing import Any

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer


class LlamaDataLoader:
    def __init__(
        self,
        dataset_repo: str,
        tokenizer_name: str = 'hf-internal-testing/llama-tokenizer',
        max_length: int = 512,
        batch_size: int = 8,
        shuffle: bool = True,
        num_workers: int = 2,
        split: str = 'train',
    ):
        """
        Initializes the LlamaDataLoader.

        Args:
            dataset_repo (str): Hugging Face dataset repository path.
            tokenizer_name (str): Name or path of the LLaMA tokenizer.
            max_length (int): Maximum sequence length for tokenization.
            batch_size (int): Batch size for the DataLoader.
            shuffle (bool): Whether to shuffle the dataset.
            num_workers (int): Number of workers for data loading.
            split (str): Dataset split to load (e.g., "train", "validation").
        """
        self.dataset_repo = dataset_repo
        self.tokenizer_name = tokenizer_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        self.split = split

        # Load dataset and tokenizer
        self.dataset = self._load_dataset()
        self.tokenizer = self._load_tokenizer()

    def _load_dataset(self):
        """Load the dataset from Hugging Face."""
        return load_dataset(self.dataset_repo, split=self.split)

    def _load_tokenizer(self):
        """Load the LLaMA tokenizer."""
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = (
                tokenizer.eos_token
            )  # Set pad token to eos token

        return tokenizer

    def _tokenize_function(self, examples: dict[str, Any]) -> dict[str, Any]:
        """
        Tokenizes the input examples.

        Args:
            examples (Dict[str, Any]): A batch of examples from the dataset.

        Returns:
            Dict[str, Any]: Tokenized examples with input_ids and attention_mask.
        """
        return self.tokenizer(
            examples[
                'text'
            ],  # Replace "text" with the appropriate key in your dataset
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

    def get_data_loader(self) -> DataLoader:
        """
        Creates and returns a PyTorch DataLoader.

        Returns:
            DataLoader: A PyTorch DataLoader for the tokenized dataset.
        """
        # Tokenize the dataset
        tokenized_dataset = self.dataset.map(
            self._tokenize_function,
            batched=True,
            remove_columns=['text'],  # Remove non-tokenized columns
        )

        # Convert to PyTorch format
        tokenized_dataset.set_format(  # type:ignore
            type='torch', columns=['input_ids', 'attention_mask']
        )

        # Create DataLoader
        data_loader = DataLoader(
            tokenized_dataset,  # type:ignore
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
        )

        return data_loader
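
A minimal usage sketch; the repository name below is a placeholder, and any Hugging Face dataset that exposes a 'text' column in the requested split should work:

from ezpz.data.llama import LlamaDataLoader

# NOTE: 'your-org/your-text-dataset' is a placeholder; substitute a real
# Hugging Face dataset repository whose 'train' split has a 'text' column.
loader = LlamaDataLoader(
    dataset_repo='your-org/your-text-dataset',
    max_length=256,
    batch_size=4,
)
data_loader = loader.get_data_loader()

for batch in data_loader:
    # Each batch is a dict of stacked tensors, shaped (batch_size, max_length).
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    print(input_ids.shape, attention_mask.shape)
    break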

__init__(dataset_repo, tokenizer_name='hf-internal-testing/llama-tokenizer', max_length=512, batch_size=8, shuffle=True, num_workers=2, split='train')

Initializes the LlamaDataLoader.

Parameters:

Name            Type  Description                                            Default
dataset_repo    str   Hugging Face dataset repository path.                  required
tokenizer_name  str   Name or path of the LLaMA tokenizer.                   'hf-internal-testing/llama-tokenizer'
max_length      int   Maximum sequence length for tokenization.              512
batch_size      int   Batch size for the DataLoader.                         8
shuffle         bool  Whether to shuffle the dataset.                        True
num_workers     int   Number of workers for data loading.                    2
split           str   Dataset split to load (e.g., "train", "validation").   'train'
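
Only dataset_repo is required; the remaining arguments keep the defaults shown above. A hedged sketch of a non-default configuration (again with a placeholder repository, which must actually provide a 'validation' split):

val_loader = LlamaDataLoader(
    dataset_repo='your-org/your-text-dataset',  # placeholder repository
    split='validation',
    shuffle=False,     # keep evaluation order deterministic
    num_workers=0,     # load data in the main process
    batch_size=16,
).get_data_loader()
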
Source code in src/ezpz/data/llama.py
def __init__(
    self,
    dataset_repo: str,
    tokenizer_name: str = 'hf-internal-testing/llama-tokenizer',
    max_length: int = 512,
    batch_size: int = 8,
    shuffle: bool = True,
    num_workers: int = 2,
    split: str = 'train',
):
    """
    Initializes the LlamaDataLoader.

    Args:
        dataset_repo (str): Hugging Face dataset repository path.
        tokenizer_name (str): Name or path of the LLaMA tokenizer.
        max_length (int): Maximum sequence length for tokenization.
        batch_size (int): Batch size for the DataLoader.
        shuffle (bool): Whether to shuffle the dataset.
        num_workers (int): Number of workers for data loading.
        split (str): Dataset split to load (e.g., "train", "validation").
    """
    self.dataset_repo = dataset_repo
    self.tokenizer_name = tokenizer_name
    self.max_length = max_length
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.num_workers = num_workers
    self.split = split

    # Load dataset and tokenizer
    self.dataset = self._load_dataset()
    self.tokenizer = self._load_tokenizer()
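
The constructor loads both pieces eagerly via _load_dataset and _load_tokenizer. The roughly equivalent manual calls (default tokenizer name, placeholder dataset repository) look like the sketch below, including the pad-token fallback needed because LLaMA tokenizers typically ship without one:

from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset('your-org/your-text-dataset', split='train')  # placeholder repo
tokenizer = AutoTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer')
if tokenizer.pad_token is None:
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token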

get_data_loader()

Creates and returns a PyTorch DataLoader.

Returns:

Name         Type         Description
DataLoader   DataLoader   A PyTorch DataLoader for the tokenized dataset.

Source code in src/ezpz/data/llama.py
def get_data_loader(self) -> DataLoader:
    """
    Creates and returns a PyTorch DataLoader.

    Returns:
        DataLoader: A PyTorch DataLoader for the tokenized dataset.
    """
    # Tokenize the dataset
    tokenized_dataset = self.dataset.map(
        self._tokenize_function,
        batched=True,
        remove_columns=['text'],  # Remove non-tokenized columns
    )

    # Convert to PyTorch format
    tokenized_dataset.set_format(  # type:ignore
        type='torch', columns=['input_ids', 'attention_mask']
    )

    # Create DataLoader
    data_loader = DataLoader(
        tokenized_dataset,  # type:ignore
        batch_size=self.batch_size,
        shuffle=self.shuffle,
        num_workers=self.num_workers,
    )

    return data_loader
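
A quick sketch of inspecting the result, continuing the placeholder loader from the usage example above:

data_loader = loader.get_data_loader()
print(len(data_loader))           # number of batches in the split
batch = next(iter(data_loader))
print(sorted(batch.keys()))       # ['attention_mask', 'input_ids']
print(batch['input_ids'].shape)   # (batch_size, max_length), e.g. torch.Size([4, 256])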