mlx-examples/esm/benchmarks/benchmark_pt.py

import time
import torch
from transformers import AutoTokenizer, EsmForMaskedLM

# Example protein sequence (Green Fluorescent Protein)
protein_sequence = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
# Hugging Face model identifier for ESM-2 (33 layers, 650M params, UR50D training set)
model_name = "facebook/esm2_t33_650M_UR50D"
# Load tokenizer and model; move model to Apple Metal Performance Shaders (MPS) device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EsmForMaskedLM.from_pretrained(model_name).to("mps")
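
# Note (not in the original script): .to("mps") assumes an Apple-silicon Mac where the
# MPS backend is available; a guard such as the following could be added first:
#     assert torch.backends.mps.is_available(), "MPS backend not available"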
# Number of sequences per forward pass
batch_size = 5
# Number of timing iterations
steps = 50

# Tokenize the sequence, replicating it batch_size times to form a batch
inputs = tokenizer(
    [protein_sequence] * batch_size,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=1024,
)
input_ids = inputs["input_ids"].to("mps")
attention_mask = inputs["attention_mask"].to("mps")

# Warm-up phase
for _ in range(10):
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    torch.mps.synchronize()  # Ensure all queued ops on MPS are complete before next step

# Timed inference loop
tic = time.time()
for _ in range(steps):
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    torch.mps.synchronize()  # Wait for computation to finish before timing next iteration
toc = time.time()

# Compute performance metrics
ms_per_step = 1000 * (toc - tic) / steps
throughput = batch_size * 1000 / ms_per_step
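# Worked example with hypothetical numbers: if ms_per_step came out to 250 ms with
# batch_size = 5, throughput = 5 * 1000 / 250 = 20 sequences/sec.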

# Report results
print(f"Time (ms) per step: {ms_per_step:.3f}")
print(f"Throughput: {throughput:.2f} sequences/sec")