| |
import asyncio
import logging
import time
from typing import Any, AsyncIterable, Dict

from transformers import AutoTokenizer, AutoModelForCausalLM

from backends_base import ChatBackend, ImagesBackend
from config import settings
|
|
| logger = logging.getLogger(__name__) |
|
|
| try: |
| import spaces |
| except ImportError: |
| spaces = None |
|
|
|
|
class TransformersChatBackend(ChatBackend):
    """
    Lightweight chat backend for Hugging Face Spaces (ZeroGPU).

    Reloads the tokenizer and model on every request with plain
    Transformers (no vLLM), so nothing stays resident between calls.
    """

    async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
        """Yield a single OpenAI-style ``chat.completion.chunk`` dict.

        Only the last message's ``content`` is used as the prompt; earlier
        turns are ignored (no chat template is applied).

        Raises:
            Exception: re-raised after logging if inference fails.
        """
        messages = request.get("messages", [])
        prompt = messages[-1]["content"] if messages else "(empty)"

        model_id = request.get("model") or settings.LlmHFModelID
        # NOTE(review): `or 0.7` / `or 512` also override an explicit 0 in
        # settings (falsy) — preserved from the original; confirm intended.
        temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
        max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))

        rid = f"chatcmpl-transformers-{int(time.time())}"
        now = int(time.time())

        def _generate(prompt_text: str) -> str:
            # Reload per request: on ZeroGPU the GPU exists only for the
            # duration of the decorated call, so nothing can be cached.
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            if spaces:
                model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
                inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
            else:
                # CPU fallback when the `spaces` package is unavailable.
                model = AutoModelForCausalLM.from_pretrained(model_id)
                inputs = tokenizer(prompt_text, return_tensors="pt")
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
            )
            # NOTE(review): decoding outputs[0] includes the prompt tokens,
            # so callers receive prompt + completion (original behavior).
            return tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Request a ZeroGPU slot only when running on Spaces.
        run_once = spaces.GPU(duration=300)(_generate) if spaces else _generate

        try:
            # Model load + generate are fully blocking; run them in a worker
            # thread so the event loop stays responsive during inference.
            text = await asyncio.to_thread(run_once, prompt)
            yield {
                "id": rid,
                "object": "chat.completion.chunk",
                "created": now,
                "model": model_id,
                "choices": [
                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
                ],
            }
        except Exception:
            logger.exception("Transformers inference failed")
            raise
|
|
|
|
class StubImagesBackend(ImagesBackend):
    """
    Image generation stub: every request yields the same placeholder image.
    """

    # Base64 of a 1x1 transparent PNG, returned verbatim for all requests.
    _PLACEHOLDER_PNG_B64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4"
        "BwQACfsD/etCJH0AAAAASUVORK5CYII="
    )

    async def generate_b64(self, request: Dict[str, Any]) -> str:
        """Warn that image generation is unsupported and return the stub PNG."""
        logger.warning("Image generation not supported in Transformers backend.")
        return self._PLACEHOLDER_PNG_B64
|
|