Spaces:

rak2315
/

ml-debug-env

Running

App Files Community

rak2315 committed on Apr 8

Commit

63eddc8

1 Parent(s): 645efc4

fix: 20/20 all tasks 1.0

Browse files

Files changed (7) hide show

.gitignore +2 -0
baseline_results.json +28 -0
inference.py +2 -2
server/__pycache__/app.cpython-310.pyc +0 -0
server/app.py +8 -3
server/baseline_inference.py +4 -2
test2.py +129 -26

.gitignore CHANGED Viewed

@@ -2,3 +2,5 @@
 .venv
 __pycache__
 *.egg-info

 .venv
 __pycache__
 *.egg-info
+test2.py
+test3.py

baseline_results.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "results": [
+    {
+      "task_id": "shape_mismatch",
+      "score": 1.0,
+      "feedback": "Perfect fix. Bug type correct, code runs cleanly, training completes, and success signal confirmed.\nExecution output:\nEpoch 1 complete\nEpoch 2 complete\nEpoch 3 complete\nTraining finished",
+      "bug_type_submitted": "shape_mismatch",
+      "execution_output": "Epoch 1 complete\nEpoch 2 complete\nEpoch 3 complete\nTraining finished"
+    },
+    {
+      "task_id": "training_collapse",
+      "score": 1.0,
+      "feedback": "Perfect fix. Bug type correct, code runs cleanly, training completes, and success signal confirmed.\nExecution output:\nEpoch 1, loss: 1.2506\nEpoch 2, loss: 1.2130\nEpoch 3, loss: 1.1767\nEpoch 4, loss: 1.1394\nEpoch 5, loss: 1.0990\nTraining finished",
+      "bug_type_submitted": "training_collapse",
+      "execution_output": "Epoch 1, loss: 1.2506\nEpoch 2, loss: 1.2130\nEpoch 3, loss: 1.1767\nEpoch 4, loss: 1.1394\nEpoch 5, loss: 1.0990\nTraining finished"
+    },
+    {
+      "task_id": "data_leakage",
+      "score": 1.0,
+      "feedback": "Perfect fix. Bug type correct, code runs cleanly, training completes, and success signal confirmed.\nExecution output:\nTest accuracy: 0.9000\nTraining finished",
+      "bug_type_submitted": "data_leakage",
+      "execution_output": "Test accuracy: 0.9000\nTraining finished"
+    }
+  ],
+  "average_score": 1.0,
+  "model": "llama-3.3-70b-versatile (Groq)",
+  "note": "Baseline uses a single-shot zero-prompt strategy with no examples."
+}

inference.py CHANGED Viewed

@@ -9,8 +9,8 @@ import json
 import urllib.request
 import urllib.error
-HF_SPACE_URL = "https://rak2315-ml-debug-env.hf.space"
 LOCAL_URL     = "http://localhost:8000"
 def hit_baseline(base_url: str, timeout: int = 180) -> dict:
@@ -23,7 +23,7 @@ def hit_baseline(base_url: str, timeout: int = 180) -> dict:
 def main():
     data = None
-    for base_url in [HF_SPACE_URL, LOCAL_URL]:
         try:
             print(f"Connecting to {base_url}/baseline ...", flush=True)
             data = hit_baseline(base_url)

 import urllib.request
 import urllib.error
 LOCAL_URL     = "http://localhost:8000"
+HF_SPACE_URL = "https://rak2315-ml-debug-env.hf.space"
 def hit_baseline(base_url: str, timeout: int = 180) -> dict:
 def main():
     data = None
+    for base_url in [LOCAL_URL, HF_SPACE_URL]:
         try:
             print(f"Connecting to {base_url}/baseline ...", flush=True)
             data = hit_baseline(base_url)

server/__pycache__/app.cpython-310.pyc CHANGED Viewed

Binary files a/server/__pycache__/app.cpython-310.pyc and b/server/__pycache__/app.cpython-310.pyc differ

server/app.py CHANGED Viewed

@@ -167,7 +167,7 @@ async def run_baseline() -> Dict[str, Any]:
     Runs the Groq-based baseline agent against all 3 tasks and returns scores.
     Requires GROQ_API_KEY environment variable.
     """
-    groq_api_key = os.environ.get("GROQ_API_KEY", "")
     if not groq_api_key:
         raise HTTPException(
             status_code=503,
@@ -178,12 +178,17 @@ async def run_baseline() -> Dict[str, Any]:
         )
     try:
         from baseline_inference import run_baseline_on_all_tasks
         results = await asyncio.get_event_loop().run_in_executor(
-            None, run_baseline_on_all_tasks, groq_api_key
         )
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Baseline run failed: {e}")
     avg = sum(r["score"] for r in results) / len(results) if results else 0.0

     Runs the Groq-based baseline agent against all 3 tasks and returns scores.
     Requires GROQ_API_KEY environment variable.
     """
+    groq_api_key = os.environ.get("GROQ_API_KEY", "").strip()
     if not groq_api_key:
         raise HTTPException(
             status_code=503,
         )
     try:
+        server_dir = os.path.dirname(os.path.abspath(__file__))
+        if server_dir not in sys.path:
+            sys.path.insert(0, server_dir)
         from baseline_inference import run_baseline_on_all_tasks
+        base_url = os.environ.get("API_BASE_URL") or "https://api.groq.com/openai/v1"
         results = await asyncio.get_event_loop().run_in_executor(
+            None, run_baseline_on_all_tasks, groq_api_key, base_url
         )
     except Exception as e:
+        import traceback
+        raise HTTPException(status_code=500, detail=f"Baseline run failed: {e}\n{traceback.format_exc()}")
     avg = sum(r["score"] for r in results) / len(results) if results else 0.0

server/baseline_inference.py CHANGED Viewed

@@ -11,7 +11,9 @@ import os
 import sys
 import json
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from openai import OpenAI
@@ -132,7 +134,7 @@ def run_baseline_on_all_tasks(api_key: str, base_url: str) -> list:
 if __name__ == "__main__":
     # Use injected proxy creds if available, fall back to Groq for local dev
-    api_key  = os.environ.get("API_KEY")  or os.environ.get("GROQ_API_KEY", "")
     base_url = os.environ.get("API_BASE_URL") or GROQ_BASE_URL
     if not api_key:

 import sys
 import json
+_server_dir = os.path.dirname(os.path.abspath(__file__))
+if _server_dir not in sys.path:
+    sys.path.insert(0, _server_dir)
 from openai import OpenAI
 if __name__ == "__main__":
     # Use injected proxy creds if available, fall back to Groq for local dev
+    api_key  = (os.environ.get("API_KEY") or os.environ.get("GROQ_API_KEY", "")).strip()
     base_url = os.environ.get("API_BASE_URL") or GROQ_BASE_URL
     if not api_key:

test2.py CHANGED Viewed

@@ -1,31 +1,134 @@
-# test_hf.py
-# run from ml_debug_env/: python test_hf.py
-import subprocess
-import sys
 import os
-print(f"Python: {sys.executable}")
-print(f"Version: {sys.version}")
-# check huggingface_hub installed
 try:
-    import huggingface_hub
-    print(f"huggingface_hub version: {huggingface_hub.__version__}")
-except ImportError:
-    print("huggingface_hub NOT installed")
-    sys.exit(1)
-# check what CLI scripts exist in venv
-scripts_dir = os.path.join(os.path.dirname(sys.executable))
-print(f"\nScripts dir: {scripts_dir}")
-hf_scripts = [f for f in os.listdir(scripts_dir) if "hugging" in f.lower() or "hf" in f.lower()]
-print(f"HF-related scripts found: {hf_scripts}")
-# try login
-print("\nAttempting HuggingFace login...")
-print("Go to https://huggingface.co/settings/tokens and create a WRITE token")
-print("Paste it below:\n")
-from huggingface_hub import login
-login()
-print("\nLogin successful!")

+# test_submission.py
+# Run this before every submission to catch validator failures early.
+# Usage: python test_submission.py
 import os
+import sys
+import json
+import subprocess
+import urllib.request
+import urllib.error
+import time
+LOCAL_URL = "http://localhost:8000"
+HF_URL    = "https://rak2315-ml-debug-env.hf.space"
+PASS = "\033[92m[PASS]\033[0m"
+FAIL = "\033[91m[FAIL]\033[0m"
+WARN = "\033[93m[WARN]\033[0m"
+results = []
+def check(name, passed, detail=""):
+    icon = PASS if passed else FAIL
+    print(f"{icon} {name}" + (f" — {detail}" if detail else ""))
+    results.append((name, passed))
+# ── 1. inference.py exists ────────────────────────────────────────────────────
+check("inference.py exists at repo root", os.path.exists("inference.py"))
+# ── 2. baseline_inference.py reads API_BASE_URL / API_KEY ────────────────────
+bi_path = os.path.join("server", "baseline_inference.py")
+if os.path.exists(bi_path):
+    content = open(bi_path).read()
+    uses_api_base = "API_BASE_URL" in content
+    uses_api_key  = "API_KEY" in content
+    check("baseline_inference.py uses API_BASE_URL", uses_api_base)
+    check("baseline_inference.py uses API_KEY", uses_api_key)
+    check("baseline_inference.py does NOT hardcode Groq URL only",
+          "API_BASE_URL" in content,
+          "must prefer injected base_url over hardcoded Groq")
+else:
+    check("server/baseline_inference.py exists", False)
+# ── 3. inference.py tries localhost first ─────────────────────────────────────
+infer_content = open("inference.py").read() if os.path.exists("inference.py") else ""
+localhost_pos = infer_content.find("localhost")
+hf_pos        = infer_content.find("hf.space")
+if localhost_pos != -1 and hf_pos != -1:
+    check("inference.py tries localhost before HF Space", localhost_pos < hf_pos)
+elif localhost_pos != -1:
+    check("inference.py tries localhost", True)
+else:
+    check("inference.py tries localhost", False, "only HF Space URL found — validator can't reach it")
+# ── 4. Run inference.py and check structured output ──────────────────────────
+print("\n── Running inference.py ──")
+env = os.environ.copy()
+env["API_BASE_URL"] = os.environ.get("API_BASE_URL", "http://localhost:8000/v1")
+env["API_KEY"]      = os.environ.get("API_KEY", "test-key")
+proc = subprocess.run(
+    [sys.executable, "inference.py"],
+    capture_output=True, text=True, timeout=60, env=env
+)
+stdout = proc.stdout
+stderr = proc.stderr
+print("STDOUT:\n", stdout[:2000] if stdout else "(empty)")
+if stderr:
+    print("STDERR:\n", stderr[:500])
+check("inference.py exits with code 0", proc.returncode == 0,
+      f"exit code {proc.returncode}")
+check("[START] found in stdout", "[START]" in stdout)
+check("[STEP]  found in stdout", "[STEP]"  in stdout)
+check("[END]   found in stdout", "[END]"   in stdout)
+# Parse and validate blocks
+tasks_found = []
+for line in stdout.splitlines():
+    if line.startswith("[END]"):
+        parts = dict(p.split("=") for p in line[5:].strip().split() if "=" in p)
+        tasks_found.append(parts)
+check("At least 3 [END] blocks found", len(tasks_found) >= 3,
+      f"found {len(tasks_found)}")
+for t in tasks_found:
+    tid   = t.get("task", "?")
+    score = float(t.get("score", -1))
+    check(f"Task {tid} score in [0.0, 1.0]", 0.0 <= score <= 1.0, f"score={score}")
+# ── 5. Check local server is reachable ───────────────────────────────────────
+print("\n── Checking local server ──")
 try:
+    with urllib.request.urlopen(f"{LOCAL_URL}/health", timeout=5) as r:
+        body = json.loads(r.read())
+    check("GET /health returns 200", True, str(body))
+except Exception as e:
+    check("GET /health reachable", False, str(e))
+    print(f"  {WARN} Start your server: uvicorn server.app:app --host 0.0.0.0 --port 8000")
+try:
+    with urllib.request.urlopen(f"{LOCAL_URL}/tasks", timeout=5) as r:
+        tasks = json.loads(r.read())
+    check("GET /tasks returns task list", isinstance(tasks, (list, dict)), str(tasks)[:80])
+except Exception as e:
+    check("GET /tasks reachable", False, str(e))
+# ── 6. Check /baseline endpoint ──────────────────────────────────────────────
+print("\n── Checking /baseline endpoint ──")
+try:
+    with urllib.request.urlopen(f"{LOCAL_URL}/baseline", timeout=120) as r:
+        data = json.loads(r.read())
+    bres = data.get("results", [])
+    avg  = data.get("average_score", 0)
+    check("/baseline returns results list", len(bres) > 0, f"{len(bres)} tasks")
+    check("/baseline average_score present", "average_score" in data, f"avg={avg:.3f}")
+    for r in bres:
+        check(f"  task {r['task_id']} score valid", 0.0 <= r['score'] <= 1.0,
+              f"score={r['score']}")
+except Exception as e:
+    check("/baseline reachable", False, str(e))
+    print(f"  {WARN} Make sure GROQ_API_KEY or API_KEY is set and server is running")
+# ── Summary ───────────────────────────────────────────────────────────────────
+print("\n══ SUMMARY ══════════════════════════════")
+passed = sum(1 for _, p in results if p)
+total  = len(results)
+print(f"{passed}/{total} checks passed")
+if passed == total:
+    print(f"{PASS} Ready to submit!")
+else:
+    print(f"{FAIL} Fix the above before submitting.")