diff --git a/t5/t5.py b/t5/t5.py index a166d4c4..8f344d11 100644 --- a/t5/t5.py +++ b/t5/t5.py @@ -210,7 +210,7 @@ class TransformerDecoderLayer(nn.Module): x = x + y y = self.ln2(x) - y, _ = self.cross_attention(x, memory, memory, memory_mask) + y, _ = self.cross_attention(y, memory, memory, memory_mask) x = x + y y = self.ln3(x)