improve evaluation docs and the way the model is passed into the evaluation script
This commit is contained in:
parent
6587b626c5
commit
bb642b320e
@ -5,8 +5,8 @@ Evaluation dataset consists of 16 screenshots. A Python script for running scree
|
||||
### Running evals
|
||||
|
||||
- Input screenshots should be located at `backend/evals_data/inputs`, and the outputs will be written to `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO.
|
||||
- Set a stack (`STACK` var) in `backend/run_evals.py`
|
||||
- Run `python backend/run_evals.py` - this runs screenshot-to-code on the input dataset in parallel, but it will still take a few minutes to complete.
|
||||
- Set a stack and model (`STACK` var, `MODEL` var) in `backend/run_evals.py`
|
||||
- Run `OPENAI_API_KEY=sk-... python run_evals.py` - this runs screenshot-to-code on the input dataset in parallel, but it will still take a few minutes to complete.
|
||||
- Once the script is done, you can find the outputs in `backend/evals_data/outputs`.
|
||||
|
||||
### Rating evals
|
||||
|
||||
@ -1,14 +1,12 @@
|
||||
import os
|
||||
from config import ANTHROPIC_API_KEY
|
||||
|
||||
from llm import stream_claude_response, stream_openai_response
|
||||
from llm import Llm, stream_claude_response, stream_openai_response
|
||||
from prompts import assemble_prompt
|
||||
from prompts.types import Stack
|
||||
|
||||
|
||||
async def generate_code_core(image_url: str, stack: Stack) -> str:
|
||||
model = "CLAUDE"
|
||||
|
||||
async def generate_code_core(image_url: str, stack: Stack, model: Llm) -> str:
|
||||
prompt_messages = assemble_prompt(image_url, stack)
|
||||
openai_api_key = os.environ.get("OPENAI_API_KEY")
|
||||
anthropic_api_key = ANTHROPIC_API_KEY
|
||||
@ -17,7 +15,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
|
||||
async def process_chunk(content: str):
|
||||
pass
|
||||
|
||||
if model == "CLAUDE":
|
||||
if model == Llm.CLAUDE_3_SONNET:
|
||||
if not anthropic_api_key:
|
||||
raise Exception("Anthropic API key not found")
|
||||
|
||||
@ -35,6 +33,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_base_url,
|
||||
callback=lambda x: process_chunk(x),
|
||||
model=model,
|
||||
)
|
||||
|
||||
return completion
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# Load environment variables first
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from llm import Llm
|
||||
|
||||
load_dotenv()
|
||||
|
||||
import os
|
||||
@ -12,6 +14,7 @@ from evals.core import generate_code_core
|
||||
from evals.utils import image_to_data_url
|
||||
|
||||
STACK = "html_tailwind"
|
||||
MODEL = Llm.CLAUDE_3_SONNET
|
||||
|
||||
|
||||
async def main():
|
||||
@ -25,7 +28,7 @@ async def main():
|
||||
for filename in evals:
|
||||
filepath = os.path.join(INPUT_DIR, filename)
|
||||
data_url = await image_to_data_url(filepath)
|
||||
task = generate_code_core(data_url, STACK)
|
||||
task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL)
|
||||
tasks.append(task)
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user