initial version of script
This commit is contained in:
parent
f2cb1a4dc3
commit
c2f230a8c9
4
backend/.gitignore
vendored
4
backend/.gitignore
vendored
@ -154,3 +154,7 @@ cython_debug/
|
|||||||
|
|
||||||
# Temporary eval output
|
# Temporary eval output
|
||||||
evals_data
|
evals_data
|
||||||
|
|
||||||
|
|
||||||
|
# Temporary video evals (Remove before merge)
|
||||||
|
video_evals
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
from typing import Awaitable, Callable, List, cast
|
from typing import Any, Awaitable, Callable, List, cast
|
||||||
from anthropic import AsyncAnthropic
|
from anthropic import AsyncAnthropic
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
|
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
|
||||||
@ -46,6 +46,7 @@ async def stream_openai_response(
|
|||||||
return full_response
|
return full_response
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Have a separate function that translates OpenAI messages to Claude messages
|
||||||
async def stream_claude_response(
|
async def stream_claude_response(
|
||||||
messages: List[ChatCompletionMessageParam],
|
messages: List[ChatCompletionMessageParam],
|
||||||
api_key: str,
|
api_key: str,
|
||||||
@ -99,3 +100,47 @@ async def stream_claude_response(
|
|||||||
# Return final message
|
# Return final message
|
||||||
response = await stream.get_final_message()
|
response = await stream.get_final_message()
|
||||||
return response.content[0].text
|
return response.content[0].text
|
||||||
|
|
||||||
|
|
||||||
|
async def stream_claude_response_native(
    system_prompt: str,
    messages: list[Any],
    api_key: str,
    callback: Callable[[str], Awaitable[None]],
    include_thinking: bool = False,
    model: str = MODEL_CLAUDE_OPUS,
    max_tokens: int = 4096,
    temperature: float = 0.0,
) -> str:
    """Stream a Claude completion for messages already in Anthropic format.

    Unlike ``stream_claude_response``, the messages are passed to the API
    as given and the system prompt is supplied separately.

    :param system_prompt: Text passed as the ``system`` argument of the request.
    :param messages: Conversation turns, passed through to the API unchanged.
    :param api_key: Anthropic API key used to build the client.
    :param callback: Awaited with every text chunk as it is streamed.
    :param include_thinking: When true, an assistant turn containing
        ``"<thinking>"`` is appended to the request as a prefill.
        NOTE(review): the returned text is expected to continue after the
        prefill rather than repeat it, so callers that need the tag should
        prepend it themselves - confirm against the API behaviour.
    :param model: Model identifier to request.
    :param max_tokens: Upper bound on generated tokens (previously fixed at 4096).
    :param temperature: Sampling temperature (previously fixed at 0.0).
    :return: The text of the first content block of the final message.
    """
    client = AsyncAnthropic(api_key=api_key)

    # Build a new list instead of rebinding the parameter; the caller's list
    # is never mutated.
    request_messages = (
        messages + [{"role": "assistant", "content": "<thinking>"}]
        if include_thinking
        else messages
    )

    async with client.messages.stream(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system_prompt,
        messages=request_messages,  # type: ignore
    ) as stream:
        async for text in stream.text_stream:
            await callback(text)

        # Fetch the accumulated final message while the stream is still open.
        response = await stream.get_final_message()

    print(
        f"Token usage: Input Tokens: {response.usage.input_tokens}, Output Tokens: {response.usage.output_tokens}"
    )

    return response.content[0].text
|
||||||
|
|||||||
@ -4,6 +4,53 @@
|
|||||||
# https://docs.anthropic.com/claude/docs/prompt-engineering
|
# https://docs.anthropic.com/claude/docs/prompt-engineering
|
||||||
# https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb
|
# https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb
|
||||||
|
|
||||||
|
# System prompt for re-creating an app from video frames using HTML, jQuery
# and Tailwind. The model is asked to reason inside <thinking></thinking> and
# to return the code inside <html></html>.
VIDEO_PROMPT = """
You are an expert at building single page, functional apps using HTML, Jquery and Tailwind CSS.
You also have perfect vision and pay great attention to detail.

You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.

- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
  padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.

In terms of libraries,

- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use jQuery: <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>

Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""
|
||||||
|
|
||||||
|
# Variant of the video prompt that asks for Alpine.js instead of jQuery. The
# model is asked to reason inside <thinking></thinking> and to return the code
# inside <html></html>.
VIDEO_PROMPT_ALPINE_JS = """
You are an expert at building single page, functional apps using HTML, Alpine.js and Tailwind CSS.
You also have perfect vision and pay great attention to detail.

You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.

- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
  padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.

In terms of libraries,

- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use Alpine.js: <script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>

Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""
|
||||||
|
|
||||||
|
|
||||||
HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
|
HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
|
||||||
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS.
|
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS.
|
||||||
You take screenshots of a reference web page from the user, and then build single page apps
|
You take screenshots of a reference web page from the user, and then build single page apps
|
||||||
@ -31,3 +78,37 @@ In terms of libraries,
|
|||||||
Return only the full code in <html></html> tags.
|
Return only the full code in <html></html> tags.
|
||||||
Do not include markdown "```" or "```html" at the start or end.
|
Do not include markdown "```" or "```html" at the start or end.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
# System prompt for turning reference screenshots into a standalone
# React + Tailwind page. The model is told to return only the code, wrapped
# in <html></html> tags.
REACT_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using React/Tailwind.
You take screenshots of a reference web page from the user, and then build single page apps
using React and Tailwind CSS.
You might also be given a screenshot (The second image) of a web page that you have already built, and asked to
update it to look more like the reference image (The first image).

- Make sure the app looks exactly like the screenshot.
- Do not leave out smaller UI elements. Make sure to include every single thing in the screenshot.
- Pay close attention to background color, text color, font size, font family,
  padding, margin, border, etc. Match the colors and sizes exactly.
- In particular, pay attention to background color and overall color scheme.
- Use the exact text from the screenshot.
- Do not add comments in the code such as "<!-- Add other navigation links as needed -->" and "<!-- ... other news items ... -->" in place of writing the full code. WRITE THE FULL CODE.
- Make sure to always get the layout right (if things are arranged in a row in the screenshot, they should be in a row in the app as well)
- CREATE REUSABLE COMPONENTS FOR REPEATING ELEMENTS. For example, if there are 15 similar items in the screenshot, your code should include a reusable component that generates these items, and use loops to instantiate these components as needed.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.

In terms of libraries,

- Use these scripts to include React so that it can run on a standalone page:
    <script src="https://unpkg.com/react/umd/react.development.js"></script>
    <script src="https://unpkg.com/react-dom/umd/react-dom.development.js"></script>
    <script src="https://unpkg.com/@babel/standalone/babel.js"></script>
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>

Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""
|
||||||
|
|||||||
226
backend/video_to_app.py
Normal file
226
backend/video_to_app.py
Normal file
@ -0,0 +1,226 @@
|
|||||||
|
# Load environment variables first
|
||||||
|
import base64
|
||||||
|
import shutil
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
import time
|
||||||
|
import subprocess
|
||||||
|
import os
|
||||||
|
from typing import Union
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime
|
||||||
|
from prompts.claude_prompts import VIDEO_PROMPT, VIDEO_PROMPT_ALPINE_JS
|
||||||
|
from utils import pprint_prompt
|
||||||
|
from config import ANTHROPIC_API_KEY
|
||||||
|
from llm import (
|
||||||
|
MODEL_CLAUDE_OPUS,
|
||||||
|
# MODEL_CLAUDE_SONNET,
|
||||||
|
stream_claude_response_native,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Stack identifier. NOTE(review): not read anywhere in the visible part of
# this script - confirm whether it is still needed.
STACK = "html_tailwind"

# Working directories, relative to the current working directory.
# VIDEO_DIR holds the input recordings read by main().
VIDEO_DIR = "./video_evals/videos"
# SCREENSHOTS_DIR is emptied and refilled with extracted frames on every run.
SCREENSHOTS_DIR = "./video_evals/screenshots"
# OUTPUTS_DIR receives the generated, timestamped HTML files.
OUTPUTS_DIR = "./video_evals/outputs"
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Generate a single-page HTML app from a screen recording using Claude.

    The run proceeds in these steps:

    1. In follow-up mode, load the most recently created ``.html`` file from
       ``OUTPUTS_DIR`` so it can be fed back to the model.
    2. Empty ``SCREENSHOTS_DIR`` and refill it with frames extracted from the
       video by ``split_video_into_screenshots``.
    3. Send the frames, in numeric filename order, to Claude with
       ``VIDEO_PROMPT`` and a ``<thinking>`` prefill.
    4. Write the ``<html>...</html>`` part of the reply to a timestamped file
       in ``OUTPUTS_DIR``.

    The video filename, frame interval and follow-up flag are fixed at the top
    of the function.

    :raises ValueError: If ``ANTHROPIC_API_KEY`` is not set, or follow-up mode
        is enabled and ``OUTPUTS_DIR`` contains no ``.html`` file.
    """
    video_filename = "mortgage-calculator.mov"
    # Interval between frames; split_video_into_screenshots divides it by 1000
    # to get seconds per frame, so the unit is milliseconds.
    screenshot_interval = 850
    is_followup = False

    # Get previous HTML
    previous_html = ""
    if is_followup:
        previous_outputs = [
            os.path.join(OUTPUTS_DIR, f)
            for f in os.listdir(OUTPUTS_DIR)
            if f.endswith(".html")
        ]
        # max() on an empty list would raise an opaque error; say what is missing.
        if not previous_outputs:
            raise ValueError(f"No previous .html output found in {OUTPUTS_DIR}")
        previous_html_file = max(previous_outputs, key=os.path.getctime)
        print(previous_html_file)
        with open(previous_html_file, "r", encoding="utf-8") as file:
            previous_html = file.read()

    if not ANTHROPIC_API_KEY:
        raise ValueError("ANTHROPIC_API_KEY is not set")

    # Create the SCREENSHOTS_DIR if it doesn't exist
    os.makedirs(SCREENSHOTS_DIR, exist_ok=True)

    # Clear out the SCREENSHOTS_DIR before generating new screenshots so that
    # frames from an earlier video are never sent to the model.
    for filename in os.listdir(SCREENSHOTS_DIR):
        file_path = os.path.join(SCREENSHOTS_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report and carry on with the remaining entries.
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Split the video into screenshots
    split_video_into_screenshots(
        os.path.join(VIDEO_DIR, video_filename), SCREENSHOTS_DIR, screenshot_interval
    )

    # Get all the screenshots in the directory
    screenshots = [f for f in os.listdir(SCREENSHOTS_DIR) if f.endswith(".jpg")]

    if not screenshots:
        # Nothing was extracted (e.g. ffmpeg failed); an image-less request
        # would be pointless.
        print("No screenshots were extracted from the video")
        return

    # NOTE(review): 20 is presumably the per-request image limit - confirm.
    if len(screenshots) > 20:
        print(f"Too many screenshots: {len(screenshots)}")
        return

    # Frames are named "<number>.jpg"; sort numerically so that 10 follows 9.
    input_image_urls: list[str] = []
    sorted_screenshots = sorted(screenshots, key=lambda x: int(x.split(".")[0]))
    for filename in sorted_screenshots:
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        data_url = await image_to_data_url(filepath)
        print(filename)
        input_image_urls.append(data_url)

    # Convert images to the message format for Claude
    content_messages: list[dict[str, Union[dict[str, str], str]]] = []
    for url in input_image_urls:
        # A data URL has the form "data:<media type>;base64,<payload>".
        media_type = url.split(";")[0].split(":")[1]
        base64_data = url.split(",")[1]
        content_messages.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_data,
                },
            }
        )

    prompt_messages = [
        {
            "role": "user",
            "content": content_messages,
        },
    ]

    if is_followup:
        prompt_messages += [
            {"role": "assistant", "content": previous_html},
            {
                "role": "user",
                "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional like in the original video.",
            },
        ]  # type: ignore

    async def process_chunk(content: str):
        # Echo streamed text as it arrives.
        print(content, end="", flush=True)

    # Must match the prefill that include_thinking=True adds to the request.
    response_prefix = "<thinking>"

    pprint_prompt(prompt_messages)  # type: ignore

    start_time = time.time()

    completion = await stream_claude_response_native(
        system_prompt=VIDEO_PROMPT,
        messages=prompt_messages,
        api_key=ANTHROPIC_API_KEY,
        callback=process_chunk,
        model=MODEL_CLAUDE_OPUS,
        include_thinking=True,
    )

    end_time = time.time()

    # Prepend the response prefix to the completion
    completion = response_prefix + completion

    # Extract the outputs
    html_content = extract_tag_content("html", completion)
    thinking = extract_tag_content("thinking", completion)

    print(thinking)
    print(f"Operation took {end_time - start_time} seconds")

    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # Generate a unique filename based on the current time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"video_test_output_{timestamp}.html"
    output_path = os.path.join(OUTPUTS_DIR, filename)

    # Write the HTML content to the file
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(html_content)

    # Show a notification. osascript only exists on macOS; skip it elsewhere
    # instead of crashing after the output has already been written.
    if shutil.which("osascript"):
        subprocess.run(["osascript", "-e", 'display notification "Coding Complete"'])
|
||||||
|
|
||||||
|
|
||||||
|
# Extract a tagged section (e.g. <html>...</html>) from the completion string
def extract_tag_content(tag: str, text: str) -> str:
    """
    Extracts the first ``<tag>...</tag>`` section from the provided text.

    Only the exact opening tag ``<tag>`` (no attributes) is recognised.

    :param tag: The tag name to search for, without angle brackets.
    :param text: The text to search within.
    :return: The matched section INCLUDING its opening and closing tags, or an
        empty string if either tag is missing.
    """
    tag_start = f"<{tag}>"
    tag_end = f"</{tag}>"

    start_idx = text.find(tag_start)
    if start_idx == -1:
        # Without an opening tag there is nothing to anchor the search on.
        return ""

    # Look for the closing tag only after the opening one.
    end_idx = text.find(tag_end, start_idx)
    if end_idx == -1:
        return ""

    return text[start_idx : end_idx + len(tag_end)]
|
||||||
|
|
||||||
|
|
||||||
|
def split_video_into_screenshots(video_path: str, output_dir: str, interval: int):
    """
    Extracts evenly spaced JPEG frames from a video by running ffmpeg.

    Frames are written into ``output_dir`` using ffmpeg's ``%d.jpg`` numbering
    pattern.

    :param video_path: Path of the video file to read.
    :param output_dir: Directory the frames are written to; created if missing.
    :param interval: Time between captured frames in milliseconds; must be positive.
    :raises ValueError: If ``interval`` is not positive.
    :raises subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Validate before touching the filesystem: a zero or negative interval
    # would produce a nonsensical "fps=1/0.0" style filter.
    if interval <= 0:
        raise ValueError(
            f"interval must be a positive number of milliseconds, got {interval}"
        )

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    seconds_per_frame = interval / 1000

    # Run the ffmpeg command to extract screenshots. check=True surfaces a
    # failed extraction here instead of leaving an empty directory behind.
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"fps=1/{seconds_per_frame}",
            os.path.join(output_dir, "%d.jpg"),
        ],
        check=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Image media types keyed by lower-case file extension.
# NOTE(review): chosen to match the image types the Anthropic API is
# documented to accept - confirm against the current API reference.
_IMAGE_MEDIA_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


async def image_to_data_url(filepath: str) -> str:
    """
    Reads an image file and returns it as a base64 ``data:`` URL.

    The media type is derived from the file extension (case-insensitive).
    Unknown extensions fall back to ``image/jpeg``, which was the previously
    hard-coded value.

    :param filepath: Path of the image file to encode.
    :return: A string of the form ``data:<media type>;base64,<payload>``.
    """
    extension = os.path.splitext(filepath)[1].lower()
    media_type = _IMAGE_MEDIA_TYPES.get(extension, "image/jpeg")

    with open(filepath, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return f"data:{media_type};base64,{encoded_string}"
|
||||||
|
|
||||||
|
|
||||||
|
# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse its helpers) does not start a run.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Loading…
Reference in New Issue
Block a user