From a5fe0960d825dc12959ec901302cc880d02eee8a Mon Sep 17 00:00:00 2001 From: Abi Raja Date: Wed, 24 Apr 2024 14:54:03 -0400 Subject: [PATCH 1/3] support best of n evals --- backend/routes/evals.py | 31 +++++++++++++-------- backend/run_evals.py | 18 ++++++++---- frontend/src/components/evals/EvalsPage.tsx | 26 +++++++++-------- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/backend/routes/evals.py b/backend/routes/evals.py index 798a9d8..22262cd 100644 --- a/backend/routes/evals.py +++ b/backend/routes/evals.py @@ -7,10 +7,13 @@ from evals.config import EVALS_DIR router = APIRouter() +# Update this if the number of outputs generated per input changes +N = 1 + class Eval(BaseModel): input: str - output: str + outputs: list[str] @router.get("/evals") @@ -25,21 +28,27 @@ async def get_evals(): input_file_path = os.path.join(input_dir, file) input_file = await image_to_data_url(input_file_path) - # Construct the corresponding output file name - output_file_name = file.replace(".png", ".html") - output_file_path = os.path.join(output_dir, output_file_name) + # Construct the corresponding output file names + output_file_names = [ + file.replace(".png", f"_{i}.html") for i in range(0, N) + ] # Generate N output filenames per input - # Check if the output file exists - if os.path.exists(output_file_path): - with open(output_file_path, "r") as f: - output_file_data = f.read() - else: - output_file_data = "Output file not found." + output_files_data: list[str] = [] + for output_file_name in output_file_names: + output_file_path = os.path.join(output_dir, output_file_name) + # Check if the output file exists + if os.path.exists(output_file_path): + with open(output_file_path, "r") as f: + output_files_data.append(f.read()) + else: + output_files_data.append( + "

Output file not found.

" + ) evals.append( Eval( input=input_file, - output=output_file_data, + outputs=output_files_data, ) ) diff --git a/backend/run_evals.py b/backend/run_evals.py index f26c708..a5fa878 100644 --- a/backend/run_evals.py +++ b/backend/run_evals.py @@ -14,7 +14,8 @@ from evals.core import generate_code_core from evals.utils import image_to_data_url STACK = "html_tailwind" -MODEL = Llm.CLAUDE_3_SONNET +MODEL = Llm.GPT_4_TURBO_2024_04_09 +N = 1 # Number of outputs to generate async def main(): @@ -28,16 +29,21 @@ async def main(): for filename in evals: filepath = os.path.join(INPUT_DIR, filename) data_url = await image_to_data_url(filepath) - task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL) - tasks.append(task) + for _ in range(N): # Generate N tasks for each input + task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL) + tasks.append(task) results = await asyncio.gather(*tasks) os.makedirs(OUTPUT_DIR, exist_ok=True) - for filename, content in zip(evals, results): - # File name is derived from the original filename in evals - output_filename = f"{os.path.splitext(filename)[0]}.html" + for i, content in enumerate(results): + # Calculate index for filename and output number + eval_index = i // N + output_number = i % N + filename = evals[eval_index] + # File name is derived from the original filename in evals with an added output number + output_filename = f"{os.path.splitext(filename)[0]}_{output_number}.html" output_filepath = os.path.join(OUTPUT_DIR, output_filename) with open(output_filepath, "w") as file: file.write(content) diff --git a/frontend/src/components/evals/EvalsPage.tsx b/frontend/src/components/evals/EvalsPage.tsx index 6e76a0d..6d2adbd 100644 --- a/frontend/src/components/evals/EvalsPage.tsx +++ b/frontend/src/components/evals/EvalsPage.tsx @@ -4,7 +4,7 @@ import RatingPicker from "./RatingPicker"; interface Eval { input: string; - output: string; + outputs: string[]; } function EvalsPage() { @@ -38,18 +38,22 @@ 
function EvalsPage() {
{evals.map((e, index) => (
-
+

{index}

+
+ {/* Update w if N changes to a fixed number like w-[600px] */}
- -
-
- {/* Put output into an iframe */} - + {`Input
+ {e.outputs.map((output, outputIndex) => ( +
+ {/* Put output into an iframe */} + +
+ ))}
Date: Mon, 13 May 2024 15:24:47 -0400 Subject: [PATCH 2/3] support GPT-4o --- backend/llm.py | 7 ++++++- backend/routes/generate_code.py | 3 ++- backend/run_evals.py | 4 ++-- frontend/src/App.tsx | 2 +- frontend/src/lib/models.ts | 3 +++ 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/backend/llm.py b/backend/llm.py index 3d653b2..e541046 100644 --- a/backend/llm.py +++ b/backend/llm.py @@ -13,6 +13,7 @@ from utils import pprint_prompt class Llm(Enum): GPT_4_VISION = "gpt-4-vision-preview" GPT_4_TURBO_2024_04_09 = "gpt-4-turbo-2024-04-09" + GPT_4O_2024_05_13 = "gpt-4o-2024-05-13" CLAUDE_3_SONNET = "claude-3-sonnet-20240229" CLAUDE_3_OPUS = "claude-3-opus-20240229" CLAUDE_3_HAIKU = "claude-3-haiku-20240307" @@ -47,7 +48,11 @@ async def stream_openai_response( } # Add 'max_tokens' only if the model is a GPT4 vision or Turbo model - if model == Llm.GPT_4_VISION or model == Llm.GPT_4_TURBO_2024_04_09: + if ( + model == Llm.GPT_4_VISION + or model == Llm.GPT_4_TURBO_2024_04_09 + or model == Llm.GPT_4O_2024_05_13 + ): params["max_tokens"] = 4096 stream = await client.chat.completions.create(**params) # type: ignore diff --git a/backend/routes/generate_code.py b/backend/routes/generate_code.py index fa5c7a5..e7186fc 100644 --- a/backend/routes/generate_code.py +++ b/backend/routes/generate_code.py @@ -85,7 +85,7 @@ async def stream_code(websocket: WebSocket): # Read the model from the request. Fall back to default if not provided. 
code_generation_model_str = params.get( - "codeGenerationModel", Llm.GPT_4_VISION.value + "codeGenerationModel", Llm.GPT_4O_2024_05_13.value ) try: code_generation_model = convert_frontend_str_to_llm(code_generation_model_str) @@ -112,6 +112,7 @@ async def stream_code(websocket: WebSocket): if not openai_api_key and ( code_generation_model == Llm.GPT_4_VISION or code_generation_model == Llm.GPT_4_TURBO_2024_04_09 + or code_generation_model == Llm.GPT_4O_2024_05_13 ): print("OpenAI API key not found") await throw_error( diff --git a/backend/run_evals.py b/backend/run_evals.py index a5fa878..bbf355a 100644 --- a/backend/run_evals.py +++ b/backend/run_evals.py @@ -13,8 +13,8 @@ from evals.config import EVALS_DIR from evals.core import generate_code_core from evals.utils import image_to_data_url -STACK = "html_tailwind" -MODEL = Llm.GPT_4_TURBO_2024_04_09 +STACK = "ionic_tailwind" +MODEL = Llm.GPT_4O_2024_05_13 N = 1 # Number of outputs to generate diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index eef3a11..33c02ca 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -63,7 +63,7 @@ function App() { isImageGenerationEnabled: true, editorTheme: EditorTheme.COBALT, generatedCodeConfig: Stack.HTML_TAILWIND, - codeGenerationModel: CodeGenerationModel.GPT_4_TURBO_2024_04_09, + codeGenerationModel: CodeGenerationModel.GPT_4O_2024_05_13, // Only relevant for hosted version isTermOfServiceAccepted: false, }, diff --git a/frontend/src/lib/models.ts b/frontend/src/lib/models.ts index ab82e23..970e63b 100644 --- a/frontend/src/lib/models.ts +++ b/frontend/src/lib/models.ts @@ -1,5 +1,7 @@ // Keep in sync with backend (llm.py) +// Order here matches dropdown order export enum CodeGenerationModel { + GPT_4O_2024_05_13 = "gpt-4o-2024-05-13", GPT_4_TURBO_2024_04_09 = "gpt-4-turbo-2024-04-09", GPT_4_VISION = "gpt_4_vision", CLAUDE_3_SONNET = "claude_3_sonnet", @@ -9,6 +11,7 @@ export enum CodeGenerationModel { export const CODE_GENERATION_MODEL_DESCRIPTIONS: { 
[key in CodeGenerationModel]: { name: string; inBeta: boolean }; } = { + "gpt-4o-2024-05-13": { name: "GPT-4O 🌟", inBeta: false }, "gpt-4-turbo-2024-04-09": { name: "GPT-4 Turbo (Apr 2024)", inBeta: false }, gpt_4_vision: { name: "GPT-4 Vision (Nov 2023)", inBeta: false }, claude_3_sonnet: { name: "Claude 3 Sonnet", inBeta: false }, From 43214bbbf894106b26e362602b0a7ca4c4e292ac Mon Sep 17 00:00:00 2001 From: Abi Raja Date: Mon, 13 May 2024 15:26:12 -0400 Subject: [PATCH 3/3] add another unit test for new model --- backend/test_llm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/test_llm.py b/backend/test_llm.py index ec005a3..aeb02ab 100644 --- a/backend/test_llm.py +++ b/backend/test_llm.py @@ -24,6 +24,11 @@ class TestConvertFrontendStrToLlm(unittest.TestCase): Llm.GPT_4_TURBO_2024_04_09, "Should convert 'gpt-4-turbo-2024-04-09' to Llm.GPT_4_TURBO_2024_04_09", ) + self.assertEqual( + convert_frontend_str_to_llm("gpt-4o-2024-05-13"), + Llm.GPT_4O_2024_05_13, + "Should convert 'gpt-4o-2024-05-13' to Llm.GPT_4O_2024_05_13", + ) def test_convert_invalid_string_raises_exception(self): with self.assertRaises(ValueError):