
ezpz.data.vision

ezpz/data/vision.py

Sam Foreman 2024-12-27

HFImageNet1K

Bases: Dataset

Thin wrapper to use HF imagenet-1k with torchvision transforms.

Source code in src/ezpz/data/vision.py
class HFImageNet1K(Dataset):
    """Thin wrapper to use HF imagenet-1k with torchvision transforms."""

    def __init__(self, hf_dataset, transform=None):
        self.ds = hf_dataset
        self.transform = transform

    def __len__(self) -> int:
        return len(self.ds)

    def __getitem__(self, idx: int):
        example = self.ds[int(idx)]
        img = example["image"]  # PIL.Image or array
        label = int(example["label"])
        if self.transform is not None:
            img = self.transform(img)
        return img, label
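
For example, a minimal usage sketch of the wrapper (assuming the datasets and torchvision packages are installed and you have access to the ILSVRC/imagenet-1k dataset on Hugging Face; the transform shown is illustrative):

# Illustrative sketch: wrap a Hugging Face split with torchvision transforms.
from datasets import load_dataset
from torchvision import transforms

hf_val = load_dataset("ILSVRC/imagenet-1k", split="validation")
to_tensor = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)
ds = HFImageNet1K(hf_val, transform=to_tensor)
img, label = ds[0]  # img: torch.Tensor, label: int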

get_imagenet(train_batch_size=128, test_batch_size=128, outdir=None, num_workers=1, shuffle=False, pin_memory=True)

Return train/test ImageNet datasets, loaders, and (optional) samplers.

Expects directory layout:

<outdir>/data/imagenet/
    train/
        class1/
        class2/
        ...
    val/
        class1/
        class2/
        ...

where train/ and val/ are standard ImageNet-style folders.
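
A minimal usage sketch, assuming the ImageNet folders above have already been staged and that distributed state is initialized before calling (the output path and the ezpz.setup_torch call below are illustrative assumptions):

import ezpz
from ezpz.data.vision import get_imagenet

_ = ezpz.setup_torch()  # assumption: initializes the rank/world_size that ezpz.dist reports
data = get_imagenet(
    train_batch_size=256,
    outdir="/path/to/outdir",  # data expected under /path/to/outdir/data/imagenet/
    num_workers=4,
)
train_loader = data["train"]["loader"]
train_sampler = data["train"]["sampler"]  # DistributedSampler when world_size > 1, else None
for images, labels in train_loader:
    break  # images: (B, 3, 224, 224) float tensor, labels: (B,) int tensor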

Source code in src/ezpz/data/vision.py
def get_imagenet(
    train_batch_size: int = 128,
    test_batch_size: int = 128,
    outdir: Optional[str | Path] = None,
    num_workers: int = 1,
    shuffle: bool = False,
    pin_memory: bool = True,
) -> dict:
    """Return train/test ImageNet datasets, loaders, and (optional) samplers.

    Expects directory layout:

        <outdir>/data/imagenet/
            train/
                class1/
                class2/
                ...
            val/
                class1/
                class2/
                ...

    where `train/` and `val/` are standard ImageNet-style folders.
    """
    outdir = OUTPUT_DIR if outdir is None else Path(outdir)
    datadir = Path(outdir).joinpath("data", "imagenet")

    # Standard ImageNet normalization
    normalize = transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    )

    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )

    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # Basic sanity check only on rank 0 (no auto-download for ImageNet)
    if ezpz.dist.get_rank() == 0:
        train_dir = datadir / "train"
        val_dir = datadir / "val"
        if not train_dir.is_dir() or not val_dir.is_dir():
            raise FileNotFoundError(
                f"Expected ImageNet data under:\n"
                f"  {train_dir}\n"
                f"  {val_dir}\n"
                "with standard ImageFolder layout."
            )

    if ezpz.dist.get_world_size() > 1:
        ezpz.dist.barrier()

    dataset1 = datasets.ImageFolder(
        root=datadir / "train", transform=train_transform
    )
    dataset2 = datasets.ImageFolder(
        root=datadir / "val", transform=test_transform
    )

    train_kwargs: dict = {
        "batch_size": train_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    test_kwargs: dict = {
        "batch_size": test_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }

    sampler1, sampler2 = None, None
    rank = ezpz.dist.get_rank()
    world_size = ezpz.dist.get_world_size()

    if world_size > 1:
        sampler1 = DistributedSampler(
            dataset1,
            rank=rank,
            num_replicas=world_size,
            shuffle=True,
        )
        sampler2 = DistributedSampler(
            dataset2,
            rank=rank,
            num_replicas=world_size,
            shuffle=False,
        )
        train_kwargs["sampler"] = sampler1
        test_kwargs["sampler"] = sampler2
    else:
        train_kwargs["shuffle"] = shuffle

    loader_train = torch.utils.data.DataLoader(
        dataset=dataset1, **train_kwargs
    )
    loader_test = torch.utils.data.DataLoader(dataset=dataset2, **test_kwargs)

    return {
        "train": {
            "data": dataset1,
            "loader": loader_train,
            "sampler": sampler1,
        },
        "test": {
            "data": dataset2,
            "loader": loader_test,
            "sampler": sampler2,
        },
    }
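
When world_size > 1, the returned train sampler is a DistributedSampler created with shuffle=True; to get a different shuffle each epoch you generally need to call set_epoch on it. A short sketch (loop variables are illustrative):

from ezpz.data.vision import get_imagenet

data = get_imagenet(outdir="/path/to/outdir")
train_loader = data["train"]["loader"]
train_sampler = data["train"]["sampler"]

num_epochs = 10
for epoch in range(num_epochs):
    if train_sampler is not None:
        train_sampler.set_epoch(epoch)  # reseeds the distributed shuffle for this epoch
    for images, labels in train_loader:
        ...  # forward / backward / optimizer step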

get_imagenet1k(train_batch_size=128, test_batch_size=128, outdir=None, num_workers=1, download=True, shuffle=False, pin_memory=True)

ILSVRC/imagenet-1k via Hugging Face, mirroring get_mnist API/behavior.
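
A minimal usage sketch, assuming the datasets package is installed and you have access to ILSVRC/imagenet-1k on Hugging Face (the path and batch sizes are illustrative):

from ezpz.data.vision import get_imagenet1k

data = get_imagenet1k(
    train_batch_size=128,
    outdir="/path/to/outdir",  # HF cache lands under /path/to/outdir/data/imagenet_hf/
    num_workers=4,
    download=True,  # rank 0 downloads into the shared cache; other ranks wait at the barrier
)
train_loader = data["train"]["loader"]
test_loader = data["test"]["loader"]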

Source code in src/ezpz/data/vision.py
def get_imagenet1k(
    train_batch_size: int = 128,
    test_batch_size: int = 128,
    outdir: Optional[str | Path] = None,
    num_workers: int = 1,
    download: bool = True,
    shuffle: bool = False,
    pin_memory: bool = True,
) -> dict:
    """ILSVRC/imagenet-1k via Hugging Face, mirroring get_mnist API/behavior."""

    try:
        from datasets import load_dataset
    except ImportError as exc:
        raise ImportError(
            "The `datasets` library is required for get_imagenet_hf.\n"
            "Install via `pip install datasets`."
        ) from exc

    outdir = OUTPUT_DIR if outdir is None else Path(outdir)
    datadir = Path(outdir).joinpath("data", "imagenet_hf")
    datadir.mkdir(parents=True, exist_ok=True)

    # Optional "don't download" behavior
    if not download and not any(datadir.iterdir()):
        raise FileNotFoundError(
            f"No cached imagenet-1k dataset found in {datadir} and download=False."
        )

    rank = ezpz.dist.get_rank()
    world_size = ezpz.dist.get_world_size()

    # Only rank 0 triggers the initial download into cache_dir
    if rank == 0 and download:
        _ = load_dataset(
            "ILSVRC/imagenet-1k",
            split="train",
            cache_dir=datadir.as_posix(),
        )
        _ = load_dataset(
            "ILSVRC/imagenet-1k",
            split="validation",
            cache_dir=datadir.as_posix(),
        )

    if world_size > 1:
        ezpz.dist.barrier()

    # Now every rank loads from the shared cache_dir
    hf_train = load_dataset(
        "ILSVRC/imagenet-1k",
        split="train",
        cache_dir=datadir.as_posix(),
    )
    hf_val = load_dataset(
        "ILSVRC/imagenet-1k",
        split="validation",
        cache_dir=datadir.as_posix(),
    )

    # ImageNet-style normalization and transforms
    normalize = transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    )

    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )

    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )

    dataset1 = HFImageNet1K(hf_train, transform=train_transform)
    dataset2 = HFImageNet1K(hf_val, transform=test_transform)

    train_kwargs: dict = {
        "batch_size": train_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    test_kwargs: dict = {
        "batch_size": test_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }

    sampler1, sampler2 = None, None
    if world_size > 1:
        sampler1 = DistributedSampler(
            dataset1,
            rank=rank,
            num_replicas=world_size,
            shuffle=True,
        )
        sampler2 = DistributedSampler(
            dataset2,
            rank=rank,
            num_replicas=world_size,
            shuffle=False,
        )
        train_kwargs["sampler"] = sampler1
        test_kwargs["sampler"] = sampler2
    else:
        train_kwargs["shuffle"] = shuffle

    train_loader = torch.utils.data.DataLoader(
        dataset=dataset1,
        **train_kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=dataset2,
        **test_kwargs,
    )

    return {
        "train": {
            "data": dataset1,
            "loader": train_loader,
            "sampler": sampler1,
        },
        "test": {
            "data": dataset2,
            "loader": test_loader,
            "sampler": sampler2,
        },
    }

get_openimages(train_batch_size=128, test_batch_size=128, outdir=None, num_workers=1, download=False, shuffle=False, pin_memory=True)

Return train/test OpenImages datasets, loaders, and samplers.

Expects an ImageFolder-style layout:

<outdir>/data/openimages/
    train/
        class_000/
        class_001/
        ...
    val/
        class_000/
        class_001/
        ...

download is kept for API parity and is not used directly; if the expected directories are missing, rank 0 attempts to download a small OpenImages subset via ezpz.data.utils.download_openimages_subset.
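
A minimal usage sketch, assuming the ImageFolder-style directories above already exist (the output path is illustrative):

from ezpz.data.vision import get_openimages

data = get_openimages(
    train_batch_size=128,
    outdir="/path/to/outdir",  # expects /path/to/outdir/data/openimages/{train,val}/
    num_workers=4,
)
train_loader = data["train"]["loader"]
val_loader = data["test"]["loader"]  # the "test" entry wraps the val/ split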

Source code in src/ezpz/data/vision.py
def get_openimages(
    train_batch_size: int = 128,
    test_batch_size: int = 128,
    outdir: Optional[str | Path] = None,
    num_workers: int = 1,
    download: bool = False,  # kept for API parity; not used
    shuffle: bool = False,
    pin_memory: bool = True,
) -> dict:
    """Return train/test OpenImages datasets, loaders, and samplers.

    Expects an ImageFolder-style layout:

        <outdir>/data/openimages/
            train/
                class_000/
                class_001/
                ...
            val/
                class_000/
                class_001/
                ...

    `download` is kept for API parity and is not used directly; if the expected
    directories are missing, rank 0 attempts to download a small OpenImages
    subset via `ezpz.data.utils.download_openimages_subset`.
    """
    outdir = OUTPUT_DIR if outdir is None else Path(outdir)
    datadir = Path(outdir).joinpath("data", "openimages")

    train_dir = datadir / "train"
    val_dir = datadir / "val"

    # Sanity check (only on rank 0)
    if ezpz.dist.get_rank() == 0:
        if not train_dir.is_dir() or not val_dir.is_dir():
            from ezpz.data.utils import download_openimages_subset

            download_openimages_subset(
                outdir=datadir,
                split="train",
                max_classes=50,
                num_workers=num_workers,
            )
            # raise FileNotFoundError(
            #     f"Expected OpenImages data under:\n"
            #     f"  {train_dir}\n"
            #     f"  {val_dir}\n"
            #     "with standard ImageFolder layout."
            # )
    if ezpz.dist.get_world_size() > 1:
        ezpz.dist.barrier()

    # Use standard ImageNet/OpenImages-like normalization
    normalize = transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    )

    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )

    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # Datasets
    dataset1 = datasets.ImageFolder(
        root=train_dir,
        transform=train_transform,
    )
    dataset2 = datasets.ImageFolder(
        root=val_dir,
        transform=test_transform,
    )

    train_kwargs: dict = {
        "batch_size": train_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    test_kwargs: dict = {
        "batch_size": test_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }

    sampler1, sampler2 = None, None
    rank = ezpz.dist.get_rank()
    world_size = ezpz.dist.get_world_size()

    if world_size > 1:
        sampler1 = DistributedSampler(
            dataset1,
            rank=rank,
            num_replicas=world_size,
            shuffle=True,
        )
        sampler2 = DistributedSampler(
            dataset2,
            rank=rank,
            num_replicas=world_size,
            shuffle=False,
        )
        train_kwargs["sampler"] = sampler1
        test_kwargs["sampler"] = sampler2
    else:
        train_kwargs["shuffle"] = shuffle

    train_loader = torch.utils.data.DataLoader(
        dataset=dataset1,
        **train_kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=dataset2,
        **test_kwargs,
    )

    return {
        "train": {
            "data": dataset1,
            "loader": train_loader,
            "sampler": sampler1,
        },
        "test": {
            "data": dataset2,
            "loader": test_loader,
            "sampler": sampler2,
        },
    }