improve evaluation docs and the way the model is passed into the evaluation script
This commit is contained in:
parent
6587b626c5
commit
bb642b320e
@ -5,8 +5,8 @@ Evaluation dataset consists of 16 screenshots. A Python script for running scree
|
|||||||
### Running evals
|
### Running evals
|
||||||
|
|
||||||
- Input screenshots should be located at `backend/evals_data/inputs`, and the outputs will be written to `backend/evals_data/outputs`. To change these locations, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO.
|
- Input screenshots should be located at `backend/evals_data/inputs`, and the outputs will be written to `backend/evals_data/outputs`. To change these locations, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO.
|
||||||
- Set a stack (`STACK` var) in `backend/run_evals.py`
|
- Set a stack and model (`STACK` var, `MODEL` var) in `backend/run_evals.py`
|
||||||
- Run `python backend/run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete.
|
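- From the `backend` directory, run `OPENAI_API_KEY=sk-... python run_evals.py` - this runs screenshot-to-code on the input dataset in parallel, but it will still take a few minutes to complete. If `MODEL` is set to `Llm.CLAUDE_3_SONNET`, the script needs an Anthropic API key (`ANTHROPIC_API_KEY`) rather than the OpenAI key.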
|
||||||
- Once the script is done, you can find the outputs in `backend/evals_data/outputs`.
|
- Once the script is done, you can find the outputs in `backend/evals_data/outputs`.
|
||||||
|
|
||||||
### Rating evals
|
### Rating evals
|
||||||
|
|||||||
@ -1,14 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
from config import ANTHROPIC_API_KEY
|
from config import ANTHROPIC_API_KEY
|
||||||
|
|
||||||
from llm import stream_claude_response, stream_openai_response
|
from llm import Llm, stream_claude_response, stream_openai_response
|
||||||
from prompts import assemble_prompt
|
from prompts import assemble_prompt
|
||||||
from prompts.types import Stack
|
from prompts.types import Stack
|
||||||
|
|
||||||
|
|
||||||
async def generate_code_core(image_url: str, stack: Stack) -> str:
|
async def generate_code_core(image_url: str, stack: Stack, model: Llm) -> str:
|
||||||
model = "CLAUDE"
|
|
||||||
|
|
||||||
prompt_messages = assemble_prompt(image_url, stack)
|
prompt_messages = assemble_prompt(image_url, stack)
|
||||||
openai_api_key = os.environ.get("OPENAI_API_KEY")
|
openai_api_key = os.environ.get("OPENAI_API_KEY")
|
||||||
anthropic_api_key = ANTHROPIC_API_KEY
|
anthropic_api_key = ANTHROPIC_API_KEY
|
||||||
@ -17,7 +15,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
|
|||||||
async def process_chunk(content: str):
|
async def process_chunk(content: str):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if model == "CLAUDE":
|
if model == Llm.CLAUDE_3_SONNET:
|
||||||
if not anthropic_api_key:
|
if not anthropic_api_key:
|
||||||
raise Exception("Anthropic API key not found")
|
raise Exception("Anthropic API key not found")
|
||||||
|
|
||||||
@ -35,6 +33,7 @@ async def generate_code_core(image_url: str, stack: Stack) -> str:
|
|||||||
api_key=openai_api_key,
|
api_key=openai_api_key,
|
||||||
base_url=openai_base_url,
|
base_url=openai_base_url,
|
||||||
callback=lambda x: process_chunk(x),
|
callback=lambda x: process_chunk(x),
|
||||||
|
model=model,
|
||||||
)
|
)
|
||||||
|
|
||||||
return completion
|
return completion
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
# Load environment variables first
|
# Load environment variables first
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from llm import Llm
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@ -12,6 +14,7 @@ from evals.core import generate_code_core
|
|||||||
from evals.utils import image_to_data_url
|
from evals.utils import image_to_data_url
|
||||||
|
|
||||||
STACK = "html_tailwind"
|
STACK = "html_tailwind"
|
||||||
|
MODEL = Llm.CLAUDE_3_SONNET
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@ -25,7 +28,7 @@ async def main():
|
|||||||
for filename in evals:
|
for filename in evals:
|
||||||
filepath = os.path.join(INPUT_DIR, filename)
|
filepath = os.path.join(INPUT_DIR, filename)
|
||||||
data_url = await image_to_data_url(filepath)
|
data_url = await image_to_data_url(filepath)
|
||||||
task = generate_code_core(data_url, STACK)
|
task = generate_code_core(image_url=data_url, stack=STACK, model=MODEL)
|
||||||
tasks.append(task)
|
tasks.append(task)
|
||||||
|
|
||||||
results = await asyncio.gather(*tasks)
|
results = await asyncio.gather(*tasks)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user