improve evaluation docs and the way the model is passed into the evaluation script
This commit is contained in:
parent
6587b626c5
commit
bb642b320e
@ -5,8 +5,8 @@ Evaluation dataset consists of 16 screenshots. A Python script for running scree
|
||||
### Running evals
|
||||
|
||||
- Input screenshots should be located at `backend/evals_data/inputs`, and the outputs will be written to `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO.
|
||||
- Set a stack (`STACK` var) in `backend/run_evals.py`
|
||||
- Run `python backend/run_evals.py` - this runs screenshot-to-code on the input dataset in parallel, but it will still take a few minutes to complete.
|
||||
- Set a stack and model (`STACK` var, `MODEL` var) in `backend/run_evals.py`
|
||||
- Run `OPENAI_API_KEY=sk-... python run_evals.py` - this runs screenshot-to-code on the input dataset in parallel, but it will still take a few minutes to complete.
|
||||
- Once the script is done, you can find the outputs in `backend/evals_data/outputs`.
|
||||
|
||||
### Rating evals
|
||||
|
||||
@ -1,14 +1,12 @@
|
||||
import os
|
||||
from config import ANTHROPIC_API_KEY
|
||||
|
||||
from llm import stream_claude_response, stream_openai_response
|
||||
from llm import Llm, stream_claude_response, stream_openai_response
|
||||
from prompts import assemble_prompt
|
||||
from prompts.types import Stack
|
||||
|
||||
|
||||
async def generate_code_core(image_url: str, stack: Stack) -> str:
|
||||
model = "CLAUDE"
|
||||
|
||||
async def generate_code_core(image_url: str, stack: Stack, model: Llm) -> str:
|
||||
prompt_messages = assemble_prompt(image_url, stack)
|
||||
openai_api_key = os.environ.get("OPENAI_API_KEY")
|
||||
anthropic_api_key = ANTHROPIC_API_KEY
|
||||
@ -17,7 +15,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
|
||||
async def process_chunk(content: str):
|
||||
pass
|
||||
|
||||
if model == "CLAUDE":
|
||||
if model == Llm.CLAUDE_3_SONNET:
|
||||
if not anthropic_api_key:
|
||||
raise Exception("Anthropic API key not found")
|
||||
|
||||
@ -35,6 +33,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_base_url,
|
||||
callback=lambda x: process_chunk(x),
|
||||
model=model,
|
||||
)
|
||||
|
||||
return completion
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# Load environment variables first
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from llm import Llm
|
||||
|
||||
load_dotenv()
|
||||
|
||||
import os
|
||||
@ -12,6 +14,7 @@ from evals.core import generate_code_core
|
||||
from evals.utils import image_to_data_url
|
||||
|
||||
STACK = "html_tailwind"
|
||||
MODEL = Llm.CLAUDE_3_SONNET
|
||||
|
||||
|
||||
async def main():
|
||||
@ -25,7 +28,7 @@ async def main():
|
||||
for filename in evals:
|
||||
filepath = os.path.join(INPUT_DIR, filename)
|
||||
data_url = await image_to_data_url(filepath)
|
||||
task = generate_code_core(data_url, STACK)
|
||||
task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL)
|
||||
tasks.append(task)
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user