support best of n evals

2024-04-24 14:54:03 -04:00 · 2024-04-24 14:54:03 -04:00 · a5fe0960d8
commit a5fe0960d8
parent f9c4dd9c7c
3 changed files with 47 additions and 28 deletions
--- a/backend/routes/evals.py
+++ b/backend/routes/evals.py
@@ -7,10 +7,13 @@ from evals.config import EVALS_DIR
 router = APIRouter()
 # Update this if the number of outputs generated per input changes
 N = 1
 class Eval(BaseModel):
    input: str
-    output: str
+    outputs: list[str]
@router.get("/evals")
@@ -25,21 +28,27 @@ async def get_evals():
            input_file_path = os.path.join(input_dir, file)
            input_file = await image_to_data_url(input_file_path)
-            # Construct the corresponding output file name
+            # Construct the corresponding output file names
-            output_file_name = file.replace(".png", ".html")
+            output_file_names = [
-            output_file_path = os.path.join(output_dir, output_file_name)
+                file.replace(".png", f"_{i}.html") for i in range(0, N)
            ]  # One output file name per generated output (N per input)
            output_files_data: list[str] = []
            for output_file_name in output_file_names:
                output_file_path = os.path.join(output_dir, output_file_name)
                # Check if the output file exists
                if os.path.exists(output_file_path):
                    with open(output_file_path, "r") as f:
-                    output_file_data = f.read()
+                        output_files_data.append(f.read())
                else:
-                output_file_data = "Output file not found."
+                    output_files_data.append(
                        "<html><h1>Output file not found.</h1></html>"
                    )
            evals.append(
                Eval(
                    input=input_file,
-                    output=output_file_data,
+                    outputs=output_files_data,
                )
            )
--- a/backend/run_evals.py
+++ b/backend/run_evals.py
@@ -14,7 +14,8 @@ from evals.core import generate_code_core
 from evals.utils import image_to_data_url
 STACK = "html_tailwind"
-MODEL = Llm.CLAUDE_3_SONNET
+MODEL = Llm.GPT_4_TURBO_2024_04_09
 N = 1  # Number of outputs to generate
 async def main():
@@ -28,6 +29,7 @@ async def main():
    for filename in evals:
        filepath = os.path.join(INPUT_DIR, filename)
        data_url = await image_to_data_url(filepath)
        for _ in range(N):  # Generate N tasks for each input
            task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL)
            tasks.append(task)
@@ -35,9 +37,13 @@ async def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
-    for filename, content in zip(evals, results):
+    for i, content in enumerate(results):
-        # File name is derived from the original filename in evals
+        # Calculate index for filename and output number
-        output_filename = f"{os.path.splitext(filename)[0]}.html"
+        eval_index = i // N
        output_number = i % N
        filename = evals[eval_index]
        # File name is derived from the original filename in evals with an added output number
        output_filename = f"{os.path.splitext(filename)[0]}_{output_number}.html"
        output_filepath = os.path.join(OUTPUT_DIR, output_filename)
        with open(output_filepath, "w") as file:
            file.write(content)
--- a/frontend/src/components/evals/EvalsPage.tsx
+++ b/frontend/src/components/evals/EvalsPage.tsx
@@ -4,7 +4,7 @@ import RatingPicker from "./RatingPicker";
 interface Eval {
  input: string;
-  output: string;
+  outputs: string[];
 }
 function EvalsPage() {
@@ -38,18 +38,22 @@ function EvalsPage() {
      <div className="flex flex-col gap-y-4 mt-4 mx-auto justify-center">
        {evals.map((e, index) => (
          <div className="flex flex-col justify-center" key={index}>
-            <div className="flex gap-x-2 justify-center">
+            <h2 className="font-bold text-lg ml-4">{index}</h2>
            <div className="flex gap-x-2 justify-center ml-4">
              {/* If N changes, replace w-1/2 with a fixed width such as w-[600px] */}
              <div className="w-1/2 p-1 border">
-                <img src={e.input} />
+                <img src={e.input} alt={`Input for eval ${index}`} />
              </div>
-              <div className="w-1/2 p-1 border">
+              {e.outputs.map((output, outputIndex) => (
                <div className="w-1/2 p-1 border" key={outputIndex}>
                  {/* Put output into an iframe */}
                  <iframe
-                  srcDoc={e.output}
+                    srcDoc={output}
                    className="w-[1200px] h-[800px] transform scale-[0.60]"
                    style={{ transformOrigin: "top left" }}
                  ></iframe>
                </div>
              ))}
            </div>
            <div className="ml-8 mt-4 flex justify-center">
              <RatingPicker