rak2315 committed on
Commit
63eddc8
·
1 Parent(s): 645efc4

fix: 20/20 all tasks 1.0

Browse files
.gitignore CHANGED
@@ -2,3 +2,5 @@
2
  .venv
3
  __pycache__
4
  *.egg-info
 
 
 
2
  .venv
3
  __pycache__
4
  *.egg-info
5
+ test2.py
6
+ test3.py
baseline_results.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "task_id": "shape_mismatch",
5
+ "score": 1.0,
6
+ "feedback": "Perfect fix. Bug type correct, code runs cleanly, training completes, and success signal confirmed.\nExecution output:\nEpoch 1 complete\nEpoch 2 complete\nEpoch 3 complete\nTraining finished",
7
+ "bug_type_submitted": "shape_mismatch",
8
+ "execution_output": "Epoch 1 complete\nEpoch 2 complete\nEpoch 3 complete\nTraining finished"
9
+ },
10
+ {
11
+ "task_id": "training_collapse",
12
+ "score": 1.0,
13
+ "feedback": "Perfect fix. Bug type correct, code runs cleanly, training completes, and success signal confirmed.\nExecution output:\nEpoch 1, loss: 1.2506\nEpoch 2, loss: 1.2130\nEpoch 3, loss: 1.1767\nEpoch 4, loss: 1.1394\nEpoch 5, loss: 1.0990\nTraining finished",
14
+ "bug_type_submitted": "training_collapse",
15
+ "execution_output": "Epoch 1, loss: 1.2506\nEpoch 2, loss: 1.2130\nEpoch 3, loss: 1.1767\nEpoch 4, loss: 1.1394\nEpoch 5, loss: 1.0990\nTraining finished"
16
+ },
17
+ {
18
+ "task_id": "data_leakage",
19
+ "score": 1.0,
20
+ "feedback": "Perfect fix. Bug type correct, code runs cleanly, training completes, and success signal confirmed.\nExecution output:\nTest accuracy: 0.9000\nTraining finished",
21
+ "bug_type_submitted": "data_leakage",
22
+ "execution_output": "Test accuracy: 0.9000\nTraining finished"
23
+ }
24
+ ],
25
+ "average_score": 1.0,
26
+ "model": "llama-3.3-70b-versatile (Groq)",
27
+ "note": "Baseline uses a single-shot zero-prompt strategy with no examples."
28
+ }
inference.py CHANGED
@@ -9,8 +9,8 @@ import json
9
  import urllib.request
10
  import urllib.error
11
 
12
- HF_SPACE_URL = "https://rak2315-ml-debug-env.hf.space"
13
  LOCAL_URL = "http://localhost:8000"
 
14
 
15
 
16
  def hit_baseline(base_url: str, timeout: int = 180) -> dict:
@@ -23,7 +23,7 @@ def hit_baseline(base_url: str, timeout: int = 180) -> dict:
23
 
24
  def main():
25
  data = None
26
- for base_url in [HF_SPACE_URL, LOCAL_URL]:
27
  try:
28
  print(f"Connecting to {base_url}/baseline ...", flush=True)
29
  data = hit_baseline(base_url)
 
9
  import urllib.request
10
  import urllib.error
11
 
 
12
  LOCAL_URL = "http://localhost:8000"
13
+ HF_SPACE_URL = "https://rak2315-ml-debug-env.hf.space"
14
 
15
 
16
  def hit_baseline(base_url: str, timeout: int = 180) -> dict:
 
23
 
24
  def main():
25
  data = None
26
+ for base_url in [LOCAL_URL, HF_SPACE_URL]:
27
  try:
28
  print(f"Connecting to {base_url}/baseline ...", flush=True)
29
  data = hit_baseline(base_url)
server/__pycache__/app.cpython-310.pyc CHANGED
Binary files a/server/__pycache__/app.cpython-310.pyc and b/server/__pycache__/app.cpython-310.pyc differ
 
server/app.py CHANGED
@@ -167,7 +167,7 @@ async def run_baseline() -> Dict[str, Any]:
167
  Runs the Groq-based baseline agent against all 3 tasks and returns scores.
168
  Requires GROQ_API_KEY environment variable.
169
  """
170
- groq_api_key = os.environ.get("GROQ_API_KEY", "")
171
  if not groq_api_key:
172
  raise HTTPException(
173
  status_code=503,
@@ -178,12 +178,17 @@ async def run_baseline() -> Dict[str, Any]:
178
  )
179
 
180
  try:
 
 
 
181
  from baseline_inference import run_baseline_on_all_tasks
 
182
  results = await asyncio.get_event_loop().run_in_executor(
183
- None, run_baseline_on_all_tasks, groq_api_key
184
  )
185
  except Exception as e:
186
- raise HTTPException(status_code=500, detail=f"Baseline run failed: {e}")
 
187
 
188
  avg = sum(r["score"] for r in results) / len(results) if results else 0.0
189
 
 
167
  Runs the Groq-based baseline agent against all 3 tasks and returns scores.
168
  Requires GROQ_API_KEY environment variable.
169
  """
170
+ groq_api_key = os.environ.get("GROQ_API_KEY", "").strip()
171
  if not groq_api_key:
172
  raise HTTPException(
173
  status_code=503,
 
178
  )
179
 
180
  try:
181
+ server_dir = os.path.dirname(os.path.abspath(__file__))
182
+ if server_dir not in sys.path:
183
+ sys.path.insert(0, server_dir)
184
  from baseline_inference import run_baseline_on_all_tasks
185
+ base_url = os.environ.get("API_BASE_URL") or "https://api.groq.com/openai/v1"
186
  results = await asyncio.get_event_loop().run_in_executor(
187
+ None, run_baseline_on_all_tasks, groq_api_key, base_url
188
  )
189
  except Exception as e:
190
+ import traceback
191
+ raise HTTPException(status_code=500, detail=f"Baseline run failed: {e}\n{traceback.format_exc()}")
192
 
193
  avg = sum(r["score"] for r in results) / len(results) if results else 0.0
194
 
server/baseline_inference.py CHANGED
@@ -11,7 +11,9 @@ import os
11
  import sys
12
  import json
13
 
14
- sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
15
 
16
  from openai import OpenAI
17
 
@@ -132,7 +134,7 @@ def run_baseline_on_all_tasks(api_key: str, base_url: str) -> list:
132
 
133
  if __name__ == "__main__":
134
  # Use injected proxy creds if available, fall back to Groq for local dev
135
- api_key = os.environ.get("API_KEY") or os.environ.get("GROQ_API_KEY", "")
136
  base_url = os.environ.get("API_BASE_URL") or GROQ_BASE_URL
137
 
138
  if not api_key:
 
11
  import sys
12
  import json
13
 
14
+ _server_dir = os.path.dirname(os.path.abspath(__file__))
15
+ if _server_dir not in sys.path:
16
+ sys.path.insert(0, _server_dir)
17
 
18
  from openai import OpenAI
19
 
 
134
 
135
  if __name__ == "__main__":
136
  # Use injected proxy creds if available, fall back to Groq for local dev
137
+ api_key = (os.environ.get("API_KEY") or os.environ.get("GROQ_API_KEY", "")).strip()
138
  base_url = os.environ.get("API_BASE_URL") or GROQ_BASE_URL
139
 
140
  if not api_key:
test2.py CHANGED
@@ -1,31 +1,134 @@
1
- # test_hf.py
2
- # run from ml_debug_env/: python test_hf.py
 
3
 
4
- import subprocess
5
- import sys
6
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- print(f"Python: {sys.executable}")
9
- print(f"Version: {sys.version}")
 
 
 
10
 
11
- # check huggingface_hub installed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  try:
13
- import huggingface_hub
14
- print(f"huggingface_hub version: {huggingface_hub.__version__}")
15
- except ImportError:
16
- print("huggingface_hub NOT installed")
17
- sys.exit(1)
18
-
19
- # check what CLI scripts exist in venv
20
- scripts_dir = os.path.join(os.path.dirname(sys.executable))
21
- print(f"\nScripts dir: {scripts_dir}")
22
- hf_scripts = [f for f in os.listdir(scripts_dir) if "hugging" in f.lower() or "hf" in f.lower()]
23
- print(f"HF-related scripts found: {hf_scripts}")
24
-
25
- # try login
26
- print("\nAttempting HuggingFace login...")
27
- print("Go to https://huggingface.co/settings/tokens and create a WRITE token")
28
- print("Paste it below:\n")
29
- from huggingface_hub import login
30
- login()
31
- print("\nLogin successful!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # test_submission.py
2
+ # Run this before every submission to catch validator failures early.
3
+ # Usage: python test_submission.py
4
 
 
 
5
  import os
6
+ import sys
7
+ import json
8
+ import subprocess
9
+ import urllib.request
10
+ import urllib.error
11
+ import time
12
+
13
+ LOCAL_URL = "http://localhost:8000"
14
+ HF_URL = "https://rak2315-ml-debug-env.hf.space"
15
+
16
+ PASS = "\033[92m[PASS]\033[0m"
17
+ FAIL = "\033[91m[FAIL]\033[0m"
18
+ WARN = "\033[93m[WARN]\033[0m"
19
+
20
+ results = []
21
+
22
+ def check(name, passed, detail=""):
23
+ icon = PASS if passed else FAIL
24
+ print(f"{icon} {name}" + (f" — {detail}" if detail else ""))
25
+ results.append((name, passed))
26
+
27
+ # ── 1. inference.py exists ────────────────────────────────────────────────────
28
+ check("inference.py exists at repo root", os.path.exists("inference.py"))
29
+
30
+ # ── 2. baseline_inference.py reads API_BASE_URL / API_KEY ────────────────────
31
+ bi_path = os.path.join("server", "baseline_inference.py")
32
+ if os.path.exists(bi_path):
33
+ content = open(bi_path).read()
34
+ uses_api_base = "API_BASE_URL" in content
35
+ uses_api_key = "API_KEY" in content
36
+ check("baseline_inference.py uses API_BASE_URL", uses_api_base)
37
+ check("baseline_inference.py uses API_KEY", uses_api_key)
38
+ check("baseline_inference.py does NOT hardcode Groq URL only",
39
+ "API_BASE_URL" in content,
40
+ "must prefer injected base_url over hardcoded Groq")
41
+ else:
42
+ check("server/baseline_inference.py exists", False)
43
+
44
+ # ── 3. inference.py tries localhost first ─────────────────────────────────────
45
+ infer_content = open("inference.py").read() if os.path.exists("inference.py") else ""
46
+ localhost_pos = infer_content.find("localhost")
47
+ hf_pos = infer_content.find("hf.space")
48
+ if localhost_pos != -1 and hf_pos != -1:
49
+ check("inference.py tries localhost before HF Space", localhost_pos < hf_pos)
50
+ elif localhost_pos != -1:
51
+ check("inference.py tries localhost", True)
52
+ else:
53
+ check("inference.py tries localhost", False, "only HF Space URL found — validator can't reach it")
54
 
55
+ # ── 4. Run inference.py and check structured output ──────────────────────────
56
+ print("\n── Running inference.py ──")
57
+ env = os.environ.copy()
58
+ env["API_BASE_URL"] = os.environ.get("API_BASE_URL", "http://localhost:8000/v1")
59
+ env["API_KEY"] = os.environ.get("API_KEY", "test-key")
60
 
61
+ proc = subprocess.run(
62
+ [sys.executable, "inference.py"],
63
+ capture_output=True, text=True, timeout=60, env=env
64
+ )
65
+ stdout = proc.stdout
66
+ stderr = proc.stderr
67
+
68
+ print("STDOUT:\n", stdout[:2000] if stdout else "(empty)")
69
+ if stderr:
70
+ print("STDERR:\n", stderr[:500])
71
+
72
+ check("inference.py exits with code 0", proc.returncode == 0,
73
+ f"exit code {proc.returncode}")
74
+ check("[START] found in stdout", "[START]" in stdout)
75
+ check("[STEP] found in stdout", "[STEP]" in stdout)
76
+ check("[END] found in stdout", "[END]" in stdout)
77
+
78
+ # Parse and validate blocks
79
+ tasks_found = []
80
+ for line in stdout.splitlines():
81
+ if line.startswith("[END]"):
82
+ parts = dict(p.split("=") for p in line[5:].strip().split() if "=" in p)
83
+ tasks_found.append(parts)
84
+
85
+ check("At least 3 [END] blocks found", len(tasks_found) >= 3,
86
+ f"found {len(tasks_found)}")
87
+
88
+ for t in tasks_found:
89
+ tid = t.get("task", "?")
90
+ score = float(t.get("score", -1))
91
+ check(f"Task {tid} score in [0.0, 1.0]", 0.0 <= score <= 1.0, f"score={score}")
92
+
93
+ # ── 5. Check local server is reachable ───────────────────────────────────────
94
+ print("\n── Checking local server ──")
95
  try:
96
+ with urllib.request.urlopen(f"{LOCAL_URL}/health", timeout=5) as r:
97
+ body = json.loads(r.read())
98
+ check("GET /health returns 200", True, str(body))
99
+ except Exception as e:
100
+ check("GET /health reachable", False, str(e))
101
+ print(f" {WARN} Start your server: uvicorn server.app:app --host 0.0.0.0 --port 8000")
102
+
103
+ try:
104
+ with urllib.request.urlopen(f"{LOCAL_URL}/tasks", timeout=5) as r:
105
+ tasks = json.loads(r.read())
106
+ check("GET /tasks returns task list", isinstance(tasks, (list, dict)), str(tasks)[:80])
107
+ except Exception as e:
108
+ check("GET /tasks reachable", False, str(e))
109
+
110
+ # ── 6. Check /baseline endpoint ──────────────────────────────────────────────
111
+ print("\n── Checking /baseline endpoint ──")
112
+ try:
113
+ with urllib.request.urlopen(f"{LOCAL_URL}/baseline", timeout=120) as r:
114
+ data = json.loads(r.read())
115
+ bres = data.get("results", [])
116
+ avg = data.get("average_score", 0)
117
+ check("/baseline returns results list", len(bres) > 0, f"{len(bres)} tasks")
118
+ check("/baseline average_score present", "average_score" in data, f"avg={avg:.3f}")
119
+ for r in bres:
120
+ check(f" task {r['task_id']} score valid", 0.0 <= r['score'] <= 1.0,
121
+ f"score={r['score']}")
122
+ except Exception as e:
123
+ check("/baseline reachable", False, str(e))
124
+ print(f" {WARN} Make sure GROQ_API_KEY or API_KEY is set and server is running")
125
+
126
+ # ── Summary ───────────────────────────────────────────────────────────────────
127
+ print("\n══ SUMMARY ══════════════════════════════")
128
+ passed = sum(1 for _, p in results if p)
129
+ total = len(results)
130
+ print(f"{passed}/{total} checks passed")
131
+ if passed == total:
132
+ print(f"{PASS} Ready to submit!")
133
+ else:
134
+ print(f"{FAIL} Fix the above before submitting.")