From a5fe0960d825dc12959ec901302cc880d02eee8a Mon Sep 17 00:00:00 2001 From: Abi Raja Date: Wed, 24 Apr 2024 14:54:03 -0400 Subject: [PATCH 1/3] support best of n evals --- backend/routes/evals.py | 31 +++++++++++++-------- backend/run_evals.py | 18 ++++++++---- frontend/src/components/evals/EvalsPage.tsx | 26 +++++++++-------- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/backend/routes/evals.py b/backend/routes/evals.py index 798a9d8..22262cd 100644 --- a/backend/routes/evals.py +++ b/backend/routes/evals.py @@ -7,10 +7,13 @@ from evals.config import EVALS_DIR router = APIRouter() +# Update this if the number of outputs generated per input changes +N = 1 + class Eval(BaseModel): input: str - output: str + outputs: list[str] @router.get("/evals") @@ -25,21 +28,27 @@ async def get_evals(): input_file_path = os.path.join(input_dir, file) input_file = await image_to_data_url(input_file_path) - # Construct the corresponding output file name - output_file_name = file.replace(".png", ".html") - output_file_path = os.path.join(output_dir, output_file_name) + # Construct the corresponding output file names + output_file_names = [ + file.replace(".png", f"_{i}.html") for i in range(0, N) + ] # Generate N output filenames per input - # Check if the output file exists - if os.path.exists(output_file_path): - with open(output_file_path, "r") as f: - output_file_data = f.read() - else: - output_file_data = "Output file not found." + output_files_data: list[str] = [] + for output_file_name in output_file_names: + output_file_path = os.path.join(output_dir, output_file_name) + # Check if the output file exists + if os.path.exists(output_file_path): + with open(output_file_path, "r") as f: + output_files_data.append(f.read()) + else: + output_files_data.append( + "

Output file not found.

" + ) evals.append( Eval( input=input_file, - output=output_file_data, + outputs=output_files_data, ) ) diff --git a/backend/run_evals.py b/backend/run_evals.py index f26c708..a5fa878 100644 --- a/backend/run_evals.py +++ b/backend/run_evals.py @@ -14,7 +14,8 @@ from evals.core import generate_code_core from evals.utils import image_to_data_url STACK = "html_tailwind" -MODEL = Llm.CLAUDE_3_SONNET +MODEL = Llm.GPT_4_TURBO_2024_04_09 +N = 1 # Number of outputs to generate async def main(): @@ -28,16 +29,21 @@ async def main(): for filename in evals: filepath = os.path.join(INPUT_DIR, filename) data_url = await image_to_data_url(filepath) - task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL) - tasks.append(task) + for _ in range(N): # Generate N tasks for each input + task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL) + tasks.append(task) results = await asyncio.gather(*tasks) os.makedirs(OUTPUT_DIR, exist_ok=True) - for filename, content in zip(evals, results): - # File name is derived from the original filename in evals - output_filename = f"{os.path.splitext(filename)[0]}.html" + for i, content in enumerate(results): + # Calculate index for filename and output number + eval_index = i // N + output_number = i % N + filename = evals[eval_index] + # File name is derived from the original filename in evals with an added output number + output_filename = f"{os.path.splitext(filename)[0]}_{output_number}.html" output_filepath = os.path.join(OUTPUT_DIR, output_filename) with open(output_filepath, "w") as file: file.write(content) diff --git a/frontend/src/components/evals/EvalsPage.tsx b/frontend/src/components/evals/EvalsPage.tsx index 6e76a0d..6d2adbd 100644 --- a/frontend/src/components/evals/EvalsPage.tsx +++ b/frontend/src/components/evals/EvalsPage.tsx @@ -4,7 +4,7 @@ import RatingPicker from "./RatingPicker"; interface Eval { input: string; - output: string; + outputs: string[]; } function EvalsPage() { @@ -38,18 +38,22 @@ 
function EvalsPage() {
{evals.map((e, index) => (
-
+

{index}

+
+ {/* Update w if N changes to a fixed number like w-[600px] */}
- -
-
- {/* Put output into an iframe */} - + {`Input
+ {e.outputs.map((output, outputIndex) => ( +
+ {/* Put output into an iframe */} + +
+ ))}
Date: Mon, 13 May 2024 15:24:47 -0400 Subject: [PATCH 2/3] support GPT-4o --- backend/llm.py | 7 ++++++- backend/routes/generate_code.py | 3 ++- backend/run_evals.py | 4 ++-- frontend/src/App.tsx | 2 +- frontend/src/lib/models.ts | 3 +++ 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/backend/llm.py b/backend/llm.py index 3d653b2..e541046 100644 --- a/backend/llm.py +++ b/backend/llm.py @@ -13,6 +13,7 @@ from utils import pprint_prompt class Llm(Enum): GPT_4_VISION = "gpt-4-vision-preview" GPT_4_TURBO_2024_04_09 = "gpt-4-turbo-2024-04-09" + GPT_4O_2024_05_13 = "gpt-4o-2024-05-13" CLAUDE_3_SONNET = "claude-3-sonnet-20240229" CLAUDE_3_OPUS = "claude-3-opus-20240229" CLAUDE_3_HAIKU = "claude-3-haiku-20240307" @@ -47,7 +48,11 @@ async def stream_openai_response( } # Add 'max_tokens' only if the model is a GPT4 vision or Turbo model - if model == Llm.GPT_4_VISION or model == Llm.GPT_4_TURBO_2024_04_09: + if ( + model == Llm.GPT_4_VISION + or model == Llm.GPT_4_TURBO_2024_04_09 + or model == Llm.GPT_4O_2024_05_13 + ): params["max_tokens"] = 4096 stream = await client.chat.completions.create(**params) # type: ignore diff --git a/backend/routes/generate_code.py b/backend/routes/generate_code.py index fa5c7a5..e7186fc 100644 --- a/backend/routes/generate_code.py +++ b/backend/routes/generate_code.py @@ -85,7 +85,7 @@ async def stream_code(websocket: WebSocket): # Read the model from the request. Fall back to default if not provided. 
code_generation_model_str = params.get( - "codeGenerationModel", Llm.GPT_4_VISION.value + "codeGenerationModel", Llm.GPT_4O_2024_05_13.value ) try: code_generation_model = convert_frontend_str_to_llm(code_generation_model_str) @@ -112,6 +112,7 @@ async def stream_code(websocket: WebSocket): if not openai_api_key and ( code_generation_model == Llm.GPT_4_VISION or code_generation_model == Llm.GPT_4_TURBO_2024_04_09 + or code_generation_model == Llm.GPT_4O_2024_05_13 ): print("OpenAI API key not found") await throw_error( diff --git a/backend/run_evals.py b/backend/run_evals.py index a5fa878..bbf355a 100644 --- a/backend/run_evals.py +++ b/backend/run_evals.py @@ -13,8 +13,8 @@ from evals.config import EVALS_DIR from evals.core import generate_code_core from evals.utils import image_to_data_url -STACK = "html_tailwind" -MODEL = Llm.GPT_4_TURBO_2024_04_09 +STACK = "ionic_tailwind" +MODEL = Llm.GPT_4O_2024_05_13 N = 1 # Number of outputs to generate diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index eef3a11..33c02ca 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -63,7 +63,7 @@ function App() { isImageGenerationEnabled: true, editorTheme: EditorTheme.COBALT, generatedCodeConfig: Stack.HTML_TAILWIND, - codeGenerationModel: CodeGenerationModel.GPT_4_TURBO_2024_04_09, + codeGenerationModel: CodeGenerationModel.GPT_4O_2024_05_13, // Only relevant for hosted version isTermOfServiceAccepted: false, }, diff --git a/frontend/src/lib/models.ts b/frontend/src/lib/models.ts index ab82e23..970e63b 100644 --- a/frontend/src/lib/models.ts +++ b/frontend/src/lib/models.ts @@ -1,5 +1,7 @@ // Keep in sync with backend (llm.py) +// Order here matches dropdown order export enum CodeGenerationModel { + GPT_4O_2024_05_13 = "gpt-4o-2024-05-13", GPT_4_TURBO_2024_04_09 = "gpt-4-turbo-2024-04-09", GPT_4_VISION = "gpt_4_vision", CLAUDE_3_SONNET = "claude_3_sonnet", @@ -9,6 +11,7 @@ export enum CodeGenerationModel { export const CODE_GENERATION_MODEL_DESCRIPTIONS: { 
[key in CodeGenerationModel]: { name: string; inBeta: boolean }; } = { + "gpt-4o-2024-05-13": { name: "GPT-4O 🌟", inBeta: false }, "gpt-4-turbo-2024-04-09": { name: "GPT-4 Turbo (Apr 2024)", inBeta: false }, gpt_4_vision: { name: "GPT-4 Vision (Nov 2023)", inBeta: false }, claude_3_sonnet: { name: "Claude 3 Sonnet", inBeta: false }, From 43214bbbf894106b26e362602b0a7ca4c4e292ac Mon Sep 17 00:00:00 2001 From: Abi Raja Date: Mon, 13 May 2024 15:26:12 -0400 Subject: [PATCH 3/3] add another unit test for new model --- backend/test_llm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/test_llm.py b/backend/test_llm.py index ec005a3..aeb02ab 100644 --- a/backend/test_llm.py +++ b/backend/test_llm.py @@ -24,6 +24,11 @@ class TestConvertFrontendStrToLlm(unittest.TestCase): Llm.GPT_4_TURBO_2024_04_09, "Should convert 'gpt-4-turbo-2024-04-09' to Llm.GPT_4_TURBO_2024_04_09", ) + self.assertEqual( + convert_frontend_str_to_llm("gpt-4o-2024-05-13"), + Llm.GPT_4O_2024_05_13, + "Should convert 'gpt-4o-2024-05-13' to Llm.GPT_4O_2024_05_13", + ) def test_convert_invalid_string_raises_exception(self): with self.assertRaises(ValueError):