initial version of script
This commit is contained in:
parent
f2cb1a4dc3
commit
c2f230a8c9
4
backend/.gitignore
vendored
4
backend/.gitignore
vendored
@ -154,3 +154,7 @@ cython_debug/
|
||||
|
||||
# Temporary eval output
|
||||
evals_data
|
||||
|
||||
|
||||
# Temporary video evals (Remove before merge)
|
||||
video_evals
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import Awaitable, Callable, List, cast
|
||||
from typing import Any, Awaitable, Callable, List, cast
|
||||
from anthropic import AsyncAnthropic
|
||||
from openai import AsyncOpenAI
|
||||
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
|
||||
@ -46,6 +46,7 @@ async def stream_openai_response(
|
||||
return full_response
|
||||
|
||||
|
||||
# TODO: Have a seperate function that translates OpenAI messages to Claude messages
|
||||
async def stream_claude_response(
|
||||
messages: List[ChatCompletionMessageParam],
|
||||
api_key: str,
|
||||
@ -99,3 +100,47 @@ async def stream_claude_response(
|
||||
# Return final message
|
||||
response = await stream.get_final_message()
|
||||
return response.content[0].text
|
||||
|
||||
|
||||
async def stream_claude_response_native(
    system_prompt: str,
    messages: list[Any],
    api_key: str,
    callback: Callable[[str], Awaitable[None]],
    include_thinking: bool = False,
    model: str = MODEL_CLAUDE_OPUS,
) -> str:
    """Stream a completion from the Anthropic Messages API.

    Each streamed text chunk is forwarded to *callback*; the full text of the
    final message is returned once the stream completes.

    :param system_prompt: System prompt passed to the model.
    :param messages: Anthropic-format message dicts (may include image parts).
    :param api_key: Anthropic API key.
    :param callback: Async function invoked with every streamed text chunk.
    :param include_thinking: When True, prefill an assistant turn with
        "<thinking>" so the model starts its reply inside that tag. Note the
        returned text then does NOT contain the prefix — callers prepend it.
    :param model: Model identifier; defaults to Claude Opus.
    :return: The complete text of the model's final message.
    """
    anthropic_client = AsyncAnthropic(api_key=api_key)

    # Optionally prefill the assistant turn with a <thinking> prefix.
    if include_thinking:
        outgoing_messages = messages + [{"role": "assistant", "content": "<thinking>"}]
    else:
        outgoing_messages = messages

    async with anthropic_client.messages.stream(
        model=model,
        max_tokens=4096,
        temperature=0.0,
        system=system_prompt,
        messages=outgoing_messages,  # type: ignore
    ) as stream:
        async for chunk_text in stream.text_stream:
            await callback(chunk_text)
        # Collect the assembled final message before the stream closes.
        final_message = await stream.get_final_message()

    print(
        f"Token usage: Input Tokens: {final_message.usage.input_tokens}, Output Tokens: {final_message.usage.output_tokens}"
    )

    return final_message.content[0].text
@ -4,6 +4,53 @@
|
||||
# https://docs.anthropic.com/claude/docs/prompt-engineering
|
||||
# https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb
|
||||
|
||||
# System prompt for the one-shot "video of an app -> HTML/jQuery app" eval.
# (Typos from the original copy fixed: functional, functionality, depicted.)
VIDEO_PROMPT = """
You are an expert at building single page, functional apps using HTML, Jquery and Tailwind CSS.
You also have perfect vision and pay great attention to detail.

You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.

- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.

In terms of libraries,

- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use jQuery: <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>

Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""
|
||||
|
||||
# Alpine.js variant of the video -> app system prompt; identical to
# VIDEO_PROMPT except for the JS library instructions.
# (Typos from the original copy fixed: functional, functionality, depicted.)
VIDEO_PROMPT_ALPINE_JS = """
You are an expert at building single page, functional apps using HTML, Alpine.js and Tailwind CSS.
You also have perfect vision and pay great attention to detail.

You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.

- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.

In terms of libraries,

- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use Alpine.js: <script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>

Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""
|
||||
|
||||
|
||||
HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
|
||||
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS.
|
||||
You take screenshots of a reference web page from the user, and then build single page apps
|
||||
@ -31,3 +78,37 @@ In terms of libraries,
|
||||
Return only the full code in <html></html> tags.
|
||||
Do not include markdown "```" or "```html" at the start or end.
|
||||
"""
|
||||
|
||||
#
|
||||
|
||||
# Claude system prompt for screenshot -> React/Tailwind generation.
# (Grammar fix: "Use these script" -> "Use these scripts".)
REACT_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using React/Tailwind.
You take screenshots of a reference web page from the user, and then build single page apps
using React and Tailwind CSS.
You might also be given a screenshot (The second image) of a web page that you have already built, and asked to
update it to look more like the reference image(The first image).

- Make sure the app looks exactly like the screenshot.
- Do not leave out smaller UI elements. Make sure to include every single thing in the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- In particular, pay attention to background color and overall color scheme.
- Use the exact text from the screenshot.
- Do not add comments in the code such as "<!-- Add other navigation links as needed -->" and "<!-- ... other news items ... -->" in place of writing the full code. WRITE THE FULL CODE.
- Make sure to always get the layout right (if things are arranged in a row in the screenshot, they should be in a row in the app as well)
- CREATE REUSABLE COMPONENTS FOR REPEATING ELEMENTS. For example, if there are 15 similar items in the screenshot, your code should include a reusable component that generates these items. and use loops to instantiate these components as needed.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.

In terms of libraries,

- Use these scripts to include React so that it can run on a standalone page:
<script src="https://unpkg.com/react/umd/react.development.js"></script>
<script src="https://unpkg.com/react-dom/umd/react-dom.development.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.js"></script>
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>

Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""
|
||||
|
||||
226
backend/video_to_app.py
Normal file
226
backend/video_to_app.py
Normal file
@ -0,0 +1,226 @@
|
||||
# Load environment variables first
|
||||
import base64
|
||||
import shutil
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
import time
|
||||
import subprocess
|
||||
import os
|
||||
from typing import Union
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from prompts.claude_prompts import VIDEO_PROMPT, VIDEO_PROMPT_ALPINE_JS
|
||||
from utils import pprint_prompt
|
||||
from config import ANTHROPIC_API_KEY
|
||||
from llm import (
|
||||
MODEL_CLAUDE_OPUS,
|
||||
# MODEL_CLAUDE_SONNET,
|
||||
stream_claude_response_native,
|
||||
)
|
||||
|
||||
STACK = "html_tailwind"
|
||||
|
||||
VIDEO_DIR = "./video_evals/videos"
|
||||
SCREENSHOTS_DIR = "./video_evals/screenshots"
|
||||
OUTPUTS_DIR = "./video_evals/outputs"
|
||||
|
||||
|
||||
async def main() -> None:
    """End-to-end eval: turn a local screen recording into a single-page app.

    Pipeline: split the video into JPEG screenshots with ffmpeg, convert them
    to base64 image messages, stream a completion from Claude, extract the
    <html> output, and write it to a timestamped file in OUTPUTS_DIR.
    """

    # Hard-coded eval inputs; edit these to run a different video.
    video_filename = "mortgage-calculator.mov"
    screenshot_interval = 850  # milliseconds between extracted frames
    is_followup = False  # True = ask the model to improve its latest output

    # Get previous HTML: in follow-up mode, reload the most recently created
    # output file so it can be fed back to the model as an assistant turn.
    previous_html = ""
    if is_followup:
        previous_html_file = max(
            [
                os.path.join(OUTPUTS_DIR, f)
                for f in os.listdir(OUTPUTS_DIR)
                if f.endswith(".html")
            ],
            key=os.path.getctime,  # newest by creation time
        )
        print(previous_html_file)
        with open(previous_html_file, "r") as file:
            previous_html = file.read()

    # Fail fast before doing any filesystem/ffmpeg work.
    if not ANTHROPIC_API_KEY:
        raise ValueError("ANTHROPIC_API_KEY is not set")

    # Create the SCREENSHOTS_DIR if it doesn't exist
    if not os.path.exists(SCREENSHOTS_DIR):
        os.makedirs(SCREENSHOTS_DIR)

    # Clear out the SCREENSHOTS_DIR before generating new screenshots
    # (best-effort: deletion failures are logged, not raised).
    for filename in os.listdir(SCREENSHOTS_DIR):
        file_path = os.path.join(SCREENSHOTS_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Split the video into screenshots (writes 1.jpg, 2.jpg, ... via ffmpeg)
    split_video_into_screenshots(
        os.path.join(VIDEO_DIR, video_filename), SCREENSHOTS_DIR, screenshot_interval
    )

    # Get all the screenshots in the directory
    screenshots = [f for f in os.listdir(SCREENSHOTS_DIR) if f.endswith(".jpg")]

    # Guard against huge prompts; bail out rather than sending too many images.
    if len(screenshots) > 20:
        print(f"Too many screenshots: {len(screenshots)}")
        return

    # Convert frames to data URLs in chronological order (files are named
    # "<frame number>.jpg", so sort numerically on the stem).
    input_image_urls: list[str] = []
    sorted_screenshots = sorted(screenshots, key=lambda x: int(x.split(".")[0]))
    for filename in sorted_screenshots:
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        data_url = await image_to_data_url(filepath)
        print(filename)
        input_image_urls.append(data_url)

    # Convert images to the message format for Claude: parse the media type
    # and base64 payload back out of each "data:<type>;base64,<data>" URL.
    content_messages: list[dict[str, Union[dict[str, str], str]]] = []
    for url in input_image_urls:
        media_type = url.split(";")[0].split(":")[1]
        base64_data = url.split(",")[1]
        content_messages.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_data,
                },
            }
        )

    prompt_messages = [
        {
            "role": "user",
            "content": content_messages,
        },
        # {"role": "assistant", "content": SECOND_MESSAGE},
        # {"role": "user", "content": "continue"},
    ]

    # In follow-up mode, append the previous draft plus an improvement request.
    if is_followup:
        prompt_messages += [
            {"role": "assistant", "content": previous_html},
            {
                "role": "user",
                "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional like in the original video.",
            },
        ]  # type: ignore

    # Echo streamed chunks to stdout as they arrive.
    async def process_chunk(content: str):
        print(content, end="", flush=True)

    # include_thinking=True prefills "<thinking>" on the assistant turn, so
    # the streamed completion is missing that prefix; re-add it below.
    response_prefix = "<thinking>"

    pprint_prompt(prompt_messages)  # type: ignore

    start_time = time.time()

    completion = await stream_claude_response_native(
        system_prompt=VIDEO_PROMPT,
        messages=prompt_messages,
        api_key=ANTHROPIC_API_KEY,
        callback=lambda x: process_chunk(x),
        model=MODEL_CLAUDE_OPUS,
        include_thinking=True,
    )

    end_time = time.time()

    # Prepend the response prefix to the completion
    completion = response_prefix + completion

    # Extract the outputs. NOTE(review): extract_tag_content returns the span
    # INCLUDING the surrounding tags, so html_content starts with "<html>".
    html_content = extract_tag_content("html", completion)
    thinking = extract_tag_content("thinking", completion)

    print(thinking)
    print(f"Operation took {end_time - start_time} seconds")

    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # Generate a unique filename based on the current time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"video_test_output_{timestamp}.html"
    output_path = os.path.join(OUTPUTS_DIR, filename)

    # Write the HTML content to the file
    with open(output_path, "w") as file:
        file.write(html_content)

    # Show a notification. NOTE(review): osascript is macOS-only; on other
    # platforms this will fail (dev-only eval script, so presumably fine).
    subprocess.run(["osascript", "-e", 'display notification "Coding Complete"'])
|
||||
# Extract tagged content (tags included) from the completion string
def extract_tag_content(tag: str, text: str) -> str:
    """Extract the first <tag>...</tag> span from *text*, tags included.

    Note: the returned string contains the opening and closing tags
    themselves, not just the inner content — callers (e.g. the HTML file
    writer) rely on this.

    :param tag: The tag name to search for (without angle brackets).
    :param text: The text to search within.
    :return: The full "<tag>...</tag>" span, or "" if the pair is not found.
    """
    tag_start = f"<{tag}>"
    tag_end = f"</{tag}>"
    start_idx = text.find(tag_start)
    # Search for the closing tag only after the opening one; if the opening
    # tag is missing, find(-1-based) still behaves because start_idx == -1
    # makes the guard below fail.
    end_idx = text.find(tag_end, start_idx)
    if start_idx != -1 and end_idx != -1:
        return text[start_idx : end_idx + len(tag_end)]
    return ""
|
||||
|
||||
def split_video_into_screenshots(video_path: str, output_dir: str, interval: int) -> None:
    """Extract JPEG frames from a video at a fixed interval using ffmpeg.

    Frames are written to output_dir as 1.jpg, 2.jpg, ... (ffmpeg %d
    numbering) — the ordering main() relies on when sorting numerically.

    :param video_path: Path to the input video file.
    :param output_dir: Directory to write frames into (created if missing).
    :param interval: Time between extracted frames, in milliseconds.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # fps = 1 frame per (interval/1000) seconds.
    # NOTE(review): requires ffmpeg on PATH; the exit code is deliberately
    # ignored (same best-effort behavior as the original subprocess.call).
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"fps=1/{interval/1000}",
            f"{output_dir}/%d.jpg",
        ]
    )
||||
|
||||
async def image_to_data_url(filepath: str) -> str:
    """Read an image file and return it as a base64 "data:" URL.

    The media type is guessed from the file extension (resolving the old
    TODO about hard-coding it), falling back to image/jpeg — the format
    split_video_into_screenshots produces — when the extension is unknown.

    :param filepath: Path to the image file on disk.
    :return: A "data:<media_type>;base64,<payload>" string.
    """
    # Local import keeps this self-contained; mimetypes is stdlib.
    import mimetypes

    media_type = mimetypes.guess_type(filepath)[0] or "image/jpeg"
    with open(filepath, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return f"data:{media_type};base64,{encoded_string}"
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Loading…
Reference in New Issue
Block a user