improve evaluation docs and the way the model is passed into the evaluation script

This commit is contained in:
Abi Raja 2024-04-11 10:52:25 -04:00
parent 6587b626c5
commit bb642b320e
3 changed files with 10 additions and 8 deletions

View File

@@ -5,8 +5,8 @@ Evaluation dataset consists of 16 screenshots. A Python script for running scree
### Running evals ### Running evals
- Input screenshots should be located at `backend/evals_data/inputs` and the outputs will be `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO. - Input screenshots should be located at `backend/evals_data/inputs` and the outputs will be `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO.
- Set a stack (`STACK` var) in `backend/run_evals.py` - Set a stack and model (`STACK` var, `MODEL` var) in `backend/run_evals.py`
- Run `python backend/run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete. - Run `OPENAI_API_KEY=sk-... python run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete.
- Once the script is done, you can find the outputs in `backend/evals_data/outputs`. - Once the script is done, you can find the outputs in `backend/evals_data/outputs`.
### Rating evals ### Rating evals

View File

@@ -1,14 +1,12 @@
import os import os
from config import ANTHROPIC_API_KEY from config import ANTHROPIC_API_KEY
from llm import stream_claude_response, stream_openai_response from llm import Llm, stream_claude_response, stream_openai_response
from prompts import assemble_prompt from prompts import assemble_prompt
from prompts.types import Stack from prompts.types import Stack
async def generate_code_core(image_url: str, stack: Stack) -> str: async def generate_code_core(image_url: str, stack: Stack, model: Llm) -> str:
model = "CLAUDE"
prompt_messages = assemble_prompt(image_url, stack) prompt_messages = assemble_prompt(image_url, stack)
openai_api_key = os.environ.get("OPENAI_API_KEY") openai_api_key = os.environ.get("OPENAI_API_KEY")
anthropic_api_key = ANTHROPIC_API_KEY anthropic_api_key = ANTHROPIC_API_KEY
@@ -17,7 +15,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
async def process_chunk(content: str): async def process_chunk(content: str):
pass pass
if model == "CLAUDE": if model == Llm.CLAUDE_3_SONNET:
if not anthropic_api_key: if not anthropic_api_key:
raise Exception("Anthropic API key not found") raise Exception("Anthropic API key not found")
@@ -35,6 +33,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_base_url, base_url=openai_base_url,
callback=lambda x: process_chunk(x), callback=lambda x: process_chunk(x),
model=model,
) )
return completion return completion

View File

@@ -1,6 +1,8 @@
# Load environment variables first # Load environment variables first
from dotenv import load_dotenv from dotenv import load_dotenv
from llm import Llm
load_dotenv() load_dotenv()
import os import os
@@ -12,6 +14,7 @@ from evals.core import generate_code_core
from evals.utils import image_to_data_url from evals.utils import image_to_data_url
STACK = "html_tailwind" STACK = "html_tailwind"
MODEL = Llm.CLAUDE_3_SONNET
async def main(): async def main():
@@ -25,7 +28,7 @@ async def main():
for filename in evals: for filename in evals:
filepath = os.path.join(INPUT_DIR, filename) filepath = os.path.join(INPUT_DIR, filename)
data_url = await image_to_data_url(filepath) data_url = await image_to_data_url(filepath)
task = generate_code_core(data_url, STACK) task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL)
tasks.append(task) tasks.append(task)
results = await asyncio.gather(*tasks) results = await asyncio.gather(*tasks)