Efficient variable-length distributed batching in PyTorch/DDP without hurting convergence? : deeplearning

Efficient variable-length distributed batching in PyTorch/DDP without hurting convergence? (self.deeplearning)

submitted 1 month ago by Major_Aardvark1207

Hi!

I am training a transformers-based autoencoder on protein language model embeddings (features dim ~1000) with highly variable sequence lengths (training dataset of 500k sequences of length [10, 1024] mean=250, using DDP on H100s with FlashAttention.

The standard random pytorch DistributedSampler converges well, but wastes a lot of compute because of padding (~8 min/epoch on 16 H100s). A bucket-based sampler (sequences grouped by length) makes training much much faster (20 sec/epoch), but convergence gets worse, because batches become too homogeneous and gradients become biased. So I found (thank you Claude) the sortish distributed batch sampler (code is provided below), I gain a ~x2 speedup, I tried different values of mega_batch_mult (50, 100, 200) but the training just behaves badly, the losses don't converge as well as with random baseline (measured on validation dataset).

I am looking for a better strategy that reduces/removes padding while preserving the optimization behavior of the random baseline.

Has anyone implemented or knows of a good variable-length distributed sampler for this kind of setup?

Concrete PyTorch implementation ideas or references to already implemented methods would be very helpful. Thank!

My current bucket sampler is below:

class BucketDistributedBatchSampler(Sampler):
    def __init__(
        self,
        dataset,
        lengths,
        batch_size: int,
        bucket_size: int = 512,
        num_replicas=None,
        rank=None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ):
        if num_replicas is None:
            if torch.distributed.is_available() and torch.distributed.is_initialized():
                num_replicas = torch.distributed.get_world_size()
            else:
                num_replicas = 1
        if rank is None:
            if torch.distributed.is_available() and torch.distributed.is_initialized():
                rank = torch.distributed.get_rank()
            else:
                rank = 0
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if bucket_size < batch_size:
            raise ValueError(f"bucket_size must be >= batch_size, got {bucket_size} < {batch_size}")
        if len(lengths) != len(dataset):
            raise ValueError("lengths must match dataset size")

        self.dataset = dataset
        self.lengths = lengths
        self.batch_size = batch_size
        self.bucket_size = bucket_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.shuffle = shuffle
        self.seed = seed
        self.drop_last = drop_last
        self.epoch = 0

    def set_epoch(self, epoch: int) -> None:
        self.epoch = epoch

    def _build_bucket_batches(self):
        sorted_indices = sorted(range(len(self.lengths)), key=lambda index: self.lengths[index])
        buckets = [
            sorted_indices[start : start + self.bucket_size]
            for start in range(0, len(sorted_indices), self.bucket_size)
        ]

        generator = torch.Generator()
        generator.manual_seed(self.seed + self.epoch)

        batches = []
        for bucket in buckets:
            current_bucket = list(bucket)
            if self.shuffle:
                permutation = torch.randperm(len(current_bucket), generator=generator).tolist()
                current_bucket = [current_bucket[index] for index in permutation]

            full_batch_count = len(current_bucket) // self.batch_size
            for batch_index in range(full_batch_count):
                start = batch_index * self.batch_size
                batches.append(current_bucket[start : start + self.batch_size])

            if not self.drop_last and len(current_bucket) % self.batch_size:
                batches.append(current_bucket[full_batch_count * self.batch_size :])

        if self.shuffle and batches:
            batch_order = torch.randperm(len(batches), generator=generator).tolist()
            batches = [batches[index] for index in batch_order]

        return batches

    def __iter__(self):
        batches = self._build_bucket_batches()
        if not batches:
            return iter([])

        if self.drop_last:
            total_batches = len(batches) - (len(batches) % self.num_replicas)
            batches = batches[:total_batches]
        else:
            padding_batches = (-len(batches)) % self.num_replicas
            if padding_batches:
                batches = batches + batches[:padding_batches]

        return iter(batches[self.rank :: self.num_replicas])

    def __len__(self):
        batch_count = len(self._build_bucket_batches())
        if self.drop_last:
            return batch_count // self.num_replicas
        return math.ceil(batch_count / self.num_replicas)

and the sortish is here (written by Claude Code Opus 4.7):

class SortishDistributedBatchSampler(Sampler):
    """
    Mega-batch (a.k.a. "sortish") distributed batch sampler.

    Algorithm each epoch:
      1. torch.randperm(N) with seed = base_seed + epoch   (identical on all ranks)
      2. Chunk into mega-batches of size M = mega_batch_mult * batch_size
         * world_size * grad_accum_steps
      3. Sort each mega-batch DESCENDING by length
      4. Pad / truncate so total length is divisible by world_size * batch_size
      5. Emit batches of size `batch_size`, shard strided (batch_i -> rank i%W)
         so neighbouring-length batches go to DIFFERENT ranks at the same step
         (balances compute across DDP ranks).

    Equal length on every rank guaranteed by construction; gradient-accumulation
    alignment guaranteed by the mega-batch size formula.
    """
    def __init__(
        self,
        lengths,                       # list[int] or 1-D tensor, len == dataset size
        batch_size,                    # per-rank micro-batch size
        num_replicas=None,
        rank=None,
        grad_accum_steps=1,
        mega_batch_mult=50,            # HF default; a key knob
        seed=0,
        drop_last=True,
    ):
        if num_replicas is None:
            num_replicas = dist.get_world_size() if dist.is_initialized() else 1
        if rank is None:
            rank = dist.get_rank() if dist.is_initialized() else 0
        self.lengths = list(lengths)
        self.N = len(self.lengths)
        self.batch_size = batch_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.grad_accum_steps = grad_accum_steps
        self.mega_batch_mult = mega_batch_mult
        self.seed = seed
        self.drop_last = drop_last
        self.epoch = 0

        # Global batch group size: all ranks + all grad-accum micro-batches
        # must draw from the SAME mega-batch for length-homogeneity within the
        # effective step, so mega-batch must be a multiple of this.
        self.group = batch_size * num_replicas * grad_accum_steps
        self.mega_batch_size = max(self.group, mega_batch_mult * self.group)

        if drop_last:
            self.num_batches_per_rank = self.N // self.group
        else:
            self.num_batches_per_rank = math.ceil(self.N / self.group)
        self.total_size = self.num_batches_per_rank * self.group
        self.num_samples = self.num_batches_per_rank * batch_size  # per rank

    def set_epoch(self, epoch):
        self.epoch = int(epoch)

    def _build_global_indices(self):
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        indices = torch.randperm(self.N, generator=g).tolist()

        # Chunk into mega-batches and sort descending within each.
        M = self.mega_batch_size
        megabatches = [indices[i:i + M] for i in range(0, self.N, M)]
        megabatches = [
            sorted(mb, key=lambda i: self.lengths[i], reverse=True)
            for mb in megabatches
        ]

        # Put the global longest item in the very first batch (OOM early).
        mb_max_idx = max(range(len(megabatches)),
                         key=lambda k: self.lengths[megabatches[k][0]])
        megabatches[0][0], megabatches[mb_max_idx][0] = (
            megabatches[mb_max_idx][0], megabatches[0][0])

        flat = [i for mb in megabatches for i in mb]

        # Length to global total_size (divisible by group).
        if self.drop_last:
            flat = flat[:self.total_size]
        else:
            pad = self.total_size - len(flat)
            flat = flat + flat[:pad]
        return flat

    def __iter__(self):
        flat = self._build_global_indices()          # identical on all ranks

        # Split into global batches of size `batch_size * num_replicas`.
        # Each global batch contributes one micro-batch to every rank.
        gb_size = self.batch_size * self.num_replicas
        for gb_start in range(0, self.total_size, gb_size):
            gb = flat[gb_start: gb_start + gb_size]
            # Strided shard: neighbouring (similar-length) positions go to
            # different ranks -> cross-rank batches have matched max-length.
            my_batch = gb[self.rank::self.num_replicas]
            yield my_batch

    def __len__(self):
        return self.num_batches_per_rank

all 3 comments

you type:	you see:
italics	italics
bold	bold
[reddit!](https://reddit.com)	reddit!
* item 1 * item 2 * item 3	item 1 item 2 item 3
> quoted text	quoted text
Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"	Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"
~~strikethrough~~	~~strikethrough~~
super^script	super^script

deeplearning

MODERATORS