import math

import mlx.core as mx
import mlx.nn as nn


class LoRALinear(nn.Module):
    """Linear layer augmented with a low-rank (LoRA) additive update.

    The forward pass computes
    ``linear(x) + scale * ((x @ lora_a) @ lora_b)`` where ``linear`` is the
    wrapped (optionally quantized) linear layer and ``lora_a`` / ``lora_b``
    are the low-rank factors of shape ``(input_dims, lora_rank)`` and
    ``(lora_rank, output_dims)``.
    """

    @staticmethod
    def from_linear(linear: nn.Linear, rank: int = 8):
        """Wrap an existing linear layer in a ``LoRALinear``.

        Args:
            linear: The ``nn.Linear`` or ``nn.QuantizedLinear`` to wrap.
            rank: Rank of the low-rank update.

        Returns:
            A ``LoRALinear`` whose ``linear`` attribute is ``linear`` itself
            (not a copy).
        """
        # TODO remove when input_dims and output_dims are attributes
        # on linear and quantized linear
        output_dims, input_dims = linear.weight.shape
        if isinstance(linear, nn.QuantizedLinear):
            # The stored weight's second dimension is scaled back up by
            # 32 // bits to get the logical input size -- presumably because
            # quantized values are packed into 32-bit words (confirm against
            # the QuantizedLinear documentation).
            input_dims *= 32 // linear.bits
        lora_lin = LoRALinear(input_dims, output_dims, rank)
        # Replace the freshly initialised inner layer with the given one.
        lora_lin.linear = linear
        return lora_lin

    def to_linear(self):
        """Fuse the low-rank update into the wrapped layer's weight.

        Returns:
            A new ``nn.Linear`` whose weight is
            ``weight + (scale * lora_b.T) @ lora_a.T``. If the wrapped layer
            is an ``nn.QuantizedLinear``, its weight is dequantized first and
            the fused layer is re-quantized with the same ``group_size`` and
            ``bits`` before being returned.
        """
        linear = self.linear
        bias = "bias" in linear
        weight = linear.weight
        is_quantized = isinstance(linear, nn.QuantizedLinear)

        # Use the same type as the linear weight if not quantized
        dtype = weight.dtype

        if is_quantized:
            # Use the scales' dtype, exactly as __call__ does for quantized
            # layers, instead of hard-coding float16. This keeps the fused
            # update in the same precision as the forward pass when the
            # scales are not float16.
            dtype = linear.scales.dtype
            weight = mx.dequantize(
                weight,
                linear.scales,
                linear.biases,
                linear.group_size,
                linear.bits,
            )
        output_dims, input_dims = weight.shape
        fused_linear = nn.Linear(input_dims, output_dims, bias=bias)

        # Transpose the factors so their product has the (output, input)
        # layout of the linear weight.
        lora_b = (self.scale * self.lora_b.T).astype(dtype)
        lora_a = self.lora_a.T.astype(dtype)
        fused_linear.weight = weight + lora_b @ lora_a
        if bias:
            fused_linear.bias = linear.bias

        if is_quantized:
            fused_linear = nn.QuantizedLinear.from_linear(
                fused_linear,
                linear.group_size,
                linear.bits,
            )

        return fused_linear

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        lora_rank: int = 8,
        bias: bool = False,
        scale: float = 20.0,
    ):
        """Create a LoRA-augmented linear layer.

        Args:
            input_dims: Size of the input features.
            output_dims: Size of the output features.
            lora_rank: Rank of the low-rank update.
            bias: Whether the inner linear layer has a bias term.
            scale: Multiplier applied to the low-rank update.
        """
        super().__init__()

        # Regular linear layer weights
        self.linear = nn.Linear(input_dims, output_dims, bias=bias)

        # Scale for low-rank update
        self.scale = scale

        # Low rank lora weights: lora_a is drawn uniformly from
        # [-1/sqrt(input_dims), 1/sqrt(input_dims)] and lora_b starts at
        # zero, so the initial low-rank update is zero.
        bound = 1 / math.sqrt(input_dims)
        self.lora_a = mx.random.uniform(
            low=-bound,
            high=bound,
            shape=(input_dims, lora_rank),
        )
        self.lora_b = mx.zeros(shape=(lora_rank, output_dims))

    def __call__(self, x):
        """Apply the wrapped linear layer plus the scaled low-rank update.

        Args:
            x: Input array; it is cast to the inner layer's compute dtype
                before the inner layer is applied. The low-rank branch uses
                ``x`` as given.

        Returns:
            ``linear(x) + scale * ((x @ lora_a) @ lora_b)``.
        """
        dtype = self.linear.weight.dtype
        if isinstance(self.linear, nn.QuantizedLinear):
            # Quantized weights are stored packed, so take the floating
            # point dtype from the scales instead.
            dtype = self.linear.scales.dtype
        y = self.linear(x.astype(dtype))
        z = (x @ self.lora_a) @ self.lora_b
        return y + self.scale * z