diff --git a/Evaluation.md b/Evaluation.md index 5fd5da8..e937b78 100644 --- a/Evaluation.md +++ b/Evaluation.md @@ -5,8 +5,8 @@ Evaluation dataset consists of 16 screenshots. A Python script for running scree ### Running evals - Input screenshots should be located at `backend/evals_data/inputs` and the outputs will be `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO. -- Set a stack (`STACK` var) in `backend/run_evals.py` -- Run `python backend/run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete. +- Set a stack and model (`STACK` var, `MODEL` var) in `backend/run_evals.py` +- Run `OPENAI_API_KEY=sk-... python backend/run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete. - Once the script is done, you can find the outputs in `backend/evals_data/outputs`. 
### Rating evals diff --git a/backend/evals/core.py b/backend/evals/core.py index 3438a7d..5e05362 100644 --- a/backend/evals/core.py +++ b/backend/evals/core.py @@ -1,14 +1,12 @@ import os from config import ANTHROPIC_API_KEY -from llm import stream_claude_response, stream_openai_response +from llm import Llm, stream_claude_response, stream_openai_response from prompts import assemble_prompt from prompts.types import Stack -async def generate_code_core(image_url: str, stack: Stack) -> str: - model = "CLAUDE" - +async def generate_code_core(image_url: str, stack: Stack, model: Llm) -> str: prompt_messages = assemble_prompt(image_url, stack) openai_api_key = os.environ.get("OPENAI_API_KEY") anthropic_api_key = ANTHROPIC_API_KEY @@ -17,7 +15,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str: async def process_chunk(content: str): pass - if model == "CLAUDE": + if model == Llm.CLAUDE_3_SONNET: if not anthropic_api_key: raise Exception("Anthropic API key not found") @@ -35,6 +33,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str: api_key=openai_api_key, base_url=openai_base_url, callback=lambda x: process_chunk(x), + model=model, ) return completion diff --git a/backend/run_evals.py b/backend/run_evals.py index a5cfefb..f26c708 100644 --- a/backend/run_evals.py +++ b/backend/run_evals.py @@ -1,6 +1,8 @@ # Load environment variables first from dotenv import load_dotenv +from llm import Llm + load_dotenv() import os @@ -12,6 +14,7 @@ from evals.core import generate_code_core from evals.utils import image_to_data_url STACK = "html_tailwind" +MODEL = Llm.CLAUDE_3_SONNET async def main(): @@ -25,7 +28,7 @@ async def main(): for filename in evals: filepath = os.path.join(INPUT_DIR, filename) data_url = await image_to_data_url(filepath) - task = generate_code_core(data_url, STACK) + task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL) tasks.append(task) results = await asyncio.gather(*tasks)