Upload AuriStream base model code
Browse files- modeling_auristream.py +22 -9
modeling_auristream.py
CHANGED
|
@@ -253,7 +253,8 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
|
|
| 253 |
AuriStream speech language model.
|
| 254 |
|
| 255 |
A GPT-like transformer model for cochlear token prediction with optional
|
| 256 |
-
multi-token prediction (MTP) heads for speculative decoding.
|
|
|
|
| 257 |
|
| 258 |
Developed by Greta Tuckute and Klemen Kotar.
|
| 259 |
"""
|
|
@@ -302,6 +303,7 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
|
|
| 302 |
self,
|
| 303 |
input_ids: Optional[torch.LongTensor] = None,
|
| 304 |
labels: Optional[torch.LongTensor] = None,
|
|
|
|
| 305 |
output_hidden_states: Optional[bool] = False,
|
| 306 |
return_dict: Optional[bool] = True,
|
| 307 |
# Legacy arguments for compatibility
|
|
@@ -314,13 +316,16 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
|
|
| 314 |
Args:
|
| 315 |
input_ids: Input token IDs of shape (batch_size, seq_len)
|
| 316 |
labels: Target token IDs for computing loss
|
|
|
|
| 317 |
output_hidden_states: Whether to return all hidden states
|
| 318 |
return_dict: Whether to return a dict or tuple
|
| 319 |
seq: Legacy argument (alias for input_ids)
|
| 320 |
tgt: Legacy argument (alias for labels)
|
| 321 |
|
| 322 |
Returns:
|
| 323 |
-
CausalLMOutput with logits and optional loss
|
|
|
|
|
|
|
| 324 |
"""
|
| 325 |
# Handle legacy arguments
|
| 326 |
if seq is not None:
|
|
@@ -347,6 +352,15 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
|
|
| 347 |
# Final layer norm and output head
|
| 348 |
x = self.ln_f(x)
|
| 349 |
logits = self.lm_head(x)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
# Compute loss if labels provided
|
| 352 |
loss = None
|
|
@@ -358,21 +372,20 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
|
|
| 358 |
|
| 359 |
# Multi-token prediction loss
|
| 360 |
if self.future_heads is not None:
|
| 361 |
-
for i,
|
| 362 |
-
future_logits = head(x[:, :-(i+1)])
|
| 363 |
loss = loss + F.cross_entropy(
|
| 364 |
-
|
| 365 |
-
labels[:, (i+1):].reshape(-1),
|
| 366 |
)
|
| 367 |
|
| 368 |
if not return_dict:
|
| 369 |
if labels is not None:
|
| 370 |
-
return logits, loss
|
| 371 |
-
return logits, None
|
| 372 |
|
| 373 |
return CausalLMOutput(
|
| 374 |
loss=loss,
|
| 375 |
-
logits=logits,
|
| 376 |
hidden_states=all_hidden_states if output_hidden_states else None,
|
| 377 |
)
|
| 378 |
|
|
|
|
| 253 |
AuriStream speech language model.
|
| 254 |
|
| 255 |
A GPT-like transformer model for cochlear token prediction with optional
|
| 256 |
+
multi-token prediction (MTP) heads for future surprisal prediction and speculative decoding.
|
| 257 |
+
(These heads also improve representation learning).
|
| 258 |
|
| 259 |
Developed by Greta Tuckute and Klemen Kotar.
|
| 260 |
"""
|
|
|
|
| 303 |
self,
|
| 304 |
input_ids: Optional[torch.LongTensor] = None,
|
| 305 |
labels: Optional[torch.LongTensor] = None,
|
| 306 |
+
output_logits: Optional[bool] = False,
|
| 307 |
output_hidden_states: Optional[bool] = False,
|
| 308 |
return_dict: Optional[bool] = True,
|
| 309 |
# Legacy arguments for compatibility
|
|
|
|
| 316 |
Args:
|
| 317 |
input_ids: Input token IDs of shape (batch_size, seq_len)
|
| 318 |
labels: Target token IDs for computing loss
|
| 319 |
+
output_logits: Whether to return logits from all prediction heads
|
| 320 |
output_hidden_states: Whether to return all hidden states
|
| 321 |
return_dict: Whether to return a dict or tuple
|
| 322 |
seq: Legacy argument (alias for input_ids)
|
| 323 |
tgt: Legacy argument (alias for labels)
|
| 324 |
|
| 325 |
Returns:
|
| 326 |
+
CausalLMOutput with logits and optional loss. When
|
| 327 |
+
output_logits=True, logits is a list containing the main LM head logits first, followed
|
| 328 |
+
by each future prediction head.
|
| 329 |
"""
|
| 330 |
# Handle legacy arguments
|
| 331 |
if seq is not None:
|
|
|
|
| 352 |
# Final layer norm and output head
|
| 353 |
x = self.ln_f(x)
|
| 354 |
logits = self.lm_head(x)
|
| 355 |
+
all_logits = [logits] if output_logits else None
|
| 356 |
+
future_logits = []
|
| 357 |
+
|
| 358 |
+
if self.future_heads is not None and (labels is not None or output_logits):
|
| 359 |
+
for i, head in enumerate(self.future_heads):
|
| 360 |
+
head_logits = head(x[:, :-(i + 1)])
|
| 361 |
+
future_logits.append(head_logits)
|
| 362 |
+
if output_logits:
|
| 363 |
+
all_logits.append(head_logits)
|
| 364 |
|
| 365 |
# Compute loss if labels provided
|
| 366 |
loss = None
|
|
|
|
| 372 |
|
| 373 |
# Multi-token prediction loss
|
| 374 |
if self.future_heads is not None:
|
| 375 |
+
for i, head_logits in enumerate(future_logits):
|
|
|
|
| 376 |
loss = loss + F.cross_entropy(
|
| 377 |
+
head_logits.reshape(-1, self.config.vocab_size),
|
| 378 |
+
labels[:, (i + 1):].reshape(-1),
|
| 379 |
)
|
| 380 |
|
| 381 |
if not return_dict:
|
| 382 |
if labels is not None:
|
| 383 |
+
return (all_logits if output_logits else logits), loss
|
| 384 |
+
return (all_logits if output_logits else logits), None
|
| 385 |
|
| 386 |
return CausalLMOutput(
|
| 387 |
loss=loss,
|
| 388 |
+
logits=all_logits if output_logits else logits,
|
| 389 |
hidden_states=all_hidden_states if output_hidden_states else None,
|
| 390 |
)
|
| 391 |
|