initial version of script

2024-03-06 20:47:28 -05:00 · 2024-03-06 20:47:28 -05:00 · c2f230a8c9
commit c2f230a8c9
parent f2cb1a4dc3
4 changed files with 357 additions and 1 deletions
--- a/backend/.gitignore
+++ b/backend/.gitignore
@ -154,3 +154,7 @@ cython_debug/
 # Temporary eval output
 evals_data
 # Temporary video evals (Remove before merge)
 video_evals
--- a/backend/llm.py
+++ b/backend/llm.py
@ -1,4 +1,4 @@
-from typing import Awaitable, Callable, List, cast
+from typing import Any, Awaitable, Callable, List, cast
 from anthropic import AsyncAnthropic
 from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
@ -46,6 +46,7 @@ async def stream_openai_response(
    return full_response
 # TODO: Have a separate function that translates OpenAI messages to Claude messages
 async def stream_claude_response(
    messages: List[ChatCompletionMessageParam],
    api_key: str,
@ -99,3 +100,47 @@ async def stream_claude_response(
    # Return final message
    response = await stream.get_final_message()
    return response.content[0].text
 async def stream_claude_response_native(
    system_prompt: str,
    messages: list[Any],
    api_key: str,
    callback: Callable[[str], Awaitable[None]],
    include_thinking: bool = False,
    model: str = MODEL_CLAUDE_OPUS,
 ) -> str:
    """
    Stream a Claude completion for messages already in Anthropic's format.

    :param system_prompt: Passed as the ``system`` argument of the request.
    :param messages: Conversation messages, forwarded unchanged to the API.
    :param api_key: Anthropic API key used to build the client.
    :param callback: Awaited with every text chunk as it is streamed.
    :param include_thinking: When True, an assistant message containing
        "<thinking>" is appended as a response prefix.
    :param model: Model identifier sent with the request.
    :return: The text of the first content block of the final message.
    """
    client = AsyncAnthropic(api_key=api_key)
    # Base parameters
    max_tokens = 4096
    temperature = 0.0
    # Add the "<thinking>" assistant prefix on a new list so the caller's
    # list is never mutated
    if include_thinking:
        messages = messages + [{"role": "assistant", "content": "<thinking>"}]
    # Stream Claude response
    async with client.messages.stream(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system_prompt,
        messages=messages,  # type: ignore
    ) as stream:
        async for text in stream.text_stream:
            await callback(text)
        # Fetch the final message while the stream is still open instead of
        # relying on it remaining usable after the context manager exits
        response = await stream.get_final_message()
    print(
        f"Token usage: Input Tokens: {response.usage.input_tokens}, Output Tokens: {response.usage.output_tokens}"
    )
    return response.content[0].text
--- a/backend/prompts/claude_prompts.py
+++ b/backend/prompts/claude_prompts.py
@ -4,6 +4,53 @@
 # https://docs.anthropic.com/claude/docs/prompt-engineering
 # https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb
 # System prompt for re-creating an app from ordered video screenshots using
 # HTML, jQuery and Tailwind CSS. It asks the model to reason inside
 # <thinking></thinking> tags and to return the code inside <html></html> tags.
 # NOTE(review): the prompt text contains typos ("funtional", "fuctionality",
 # "depicated"); they are runtime text sent to the model, so they are left as-is here.
 VIDEO_PROMPT = """
 You are an expert at building single page, funtional apps using HTML, Jquery and Tailwind CSS.
 You also have perfect vision and pay great attention to detail.
 You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
 - Make sure the app looks exactly like the screenshot.
 - Pay close attention to background color, text color, font size, font family, 
 padding, margin, border, etc. Match the colors and sizes exactly.
 - For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
 - If some fuctionality requires a backend call, just mock the data instead.
 - MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
 In terms of libraries,
 - Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
 - You can use Google Fonts
 - Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
 - Use jQuery: <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
 Before generating the code for the app, think step-by-step: first, about the user flow depicated in the video and then about you how would you build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
 """
 # Variant of the video prompt that targets Alpine.js instead of jQuery; the
 # remaining instructions and the <thinking>/<html> output format are the same.
 # NOTE(review): the prompt text contains typos ("funtional", "fuctionality",
 # "depicated"); they are runtime text sent to the model, so they are left as-is here.
 VIDEO_PROMPT_ALPINE_JS = """
 You are an expert at building single page, funtional apps using HTML, Alpine.js and Tailwind CSS.
 You also have perfect vision and pay great attention to detail.
 You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
 - Make sure the app looks exactly like the screenshot.
 - Pay close attention to background color, text color, font size, font family, 
 padding, margin, border, etc. Match the colors and sizes exactly.
 - For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
 - If some fuctionality requires a backend call, just mock the data instead.
 - MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
 In terms of libraries,
 - Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
 - You can use Google Fonts
 - Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
 - Use Alpine.js: <script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
 Before generating the code for the app, think step-by-step: first, about the user flow depicated in the video and then about you how would you build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
 """
 HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
 You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS.
 You take screenshots of a reference web page from the user, and then build single page apps 
@ -31,3 +78,37 @@ In terms of libraries,
 Return only the full code in <html></html> tags.
 Do not include markdown "```" or "```html" at the start or end.
 """
 #
 # System prompt for building a single page app with React and Tailwind CSS
 # from a reference screenshot (optionally with a second screenshot of a
 # previous attempt). It asks for only the full code inside <html></html> tags,
 # without markdown fences.
 REACT_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
 You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using React/Tailwind.
 You take screenshots of a reference web page from the user, and then build single page apps 
 using React and Tailwind CSS.
 You might also be given a screenshot (The second image) of a web page that you have already built, and asked to
 update it to look more like the reference image(The first image).
 - Make sure the app looks exactly like the screenshot.
 - Do not leave out smaller UI elements. Make sure to include every single thing in the screenshot.
 - Pay close attention to background color, text color, font size, font family, 
 padding, margin, border, etc. Match the colors and sizes exactly.
 - In particular, pay attention to background color and overall color scheme.
 - Use the exact text from the screenshot.
 - Do not add comments in the code such as "<!-- Add other navigation links as needed -->" and "<!-- ... other news items ... -->" in place of writing the full code. WRITE THE FULL CODE.
 - Make sure to always get the layout right (if things are arranged in a row in the screenshot, they should be in a row in the app as well)
 - CREATE REUSABLE COMPONENTS FOR REPEATING ELEMENTS. For example, if there are 15 similar items in the screenshot, your code should include a reusable component that generates these items. and use loops to instantiate these components as needed.
 - For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
 In terms of libraries,
 - Use these script to include React so that it can run on a standalone page:
    <script src="https://unpkg.com/react/umd/react.development.js"></script>
    <script src="https://unpkg.com/react-dom/umd/react-dom.development.js"></script>
    <script src="https://unpkg.com/@babel/standalone/babel.js"></script>
 - Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
 - You can use Google Fonts
 - Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
 Return only the full code in <html></html> tags.
 Do not include markdown "```" or "```html" at the start or end.
 """
--- a/backend/video_to_app.py
+++ b/backend/video_to_app.py
@ -0,0 +1,226 @@
 # Load environment variables first
 import base64
 import shutil
 from dotenv import load_dotenv
 load_dotenv()
 import time
 import subprocess
 import os
 from typing import Union
 import asyncio
 from datetime import datetime
 from prompts.claude_prompts import VIDEO_PROMPT, VIDEO_PROMPT_ALPINE_JS
 from utils import pprint_prompt
 from config import ANTHROPIC_API_KEY
 from llm import (
    MODEL_CLAUDE_OPUS,
    # MODEL_CLAUDE_SONNET,
    stream_claude_response_native,
 )
 # Label of the target stack.
 # NOTE(review): not referenced by the code visible in this script - confirm it is still needed.
 STACK = "html_tailwind"
 # Directory the input video file is read from.
 VIDEO_DIR = "./video_evals/videos"
 # Directory the extracted frames are written to; main() empties it on every run.
 SCREENSHOTS_DIR = "./video_evals/screenshots"
 # Directory the generated HTML files are written to.
 OUTPUTS_DIR = "./video_evals/outputs"
 async def main():
    """
    Convert a screen recording into a single page app using Claude.

    Steps: empty SCREENSHOTS_DIR, split the video into JPEG frames, send the
    frames (at most 20) to Claude with VIDEO_PROMPT, extract the <html> and
    <thinking> sections from the completion, and write the HTML to a
    timestamped file in OUTPUTS_DIR. When ``is_followup`` is enabled, the most
    recently created HTML output is sent back to the model with a request to
    improve it.

    :raises ValueError: If ANTHROPIC_API_KEY is not set.
    """
    video_filename = "mortgage-calculator.mov"
    screenshot_interval = 850
    is_followup = False
    # Get previous HTML
    previous_html = ""
    if is_followup:
        previous_html_file = max(
            [
                os.path.join(OUTPUTS_DIR, f)
                for f in os.listdir(OUTPUTS_DIR)
                if f.endswith(".html")
            ],
            key=os.path.getctime,
        )
        print(previous_html_file)
        # Explicit encoding: generated HTML may contain non-ASCII characters
        with open(previous_html_file, "r", encoding="utf-8") as file:
            previous_html = file.read()
    if not ANTHROPIC_API_KEY:
        raise ValueError("ANTHROPIC_API_KEY is not set")
    # Create the SCREENSHOTS_DIR if it doesn't exist
    os.makedirs(SCREENSHOTS_DIR, exist_ok=True)
    # Clear out the SCREENSHOTS_DIR before generating new screenshots
    for filename in os.listdir(SCREENSHOTS_DIR):
        file_path = os.path.join(SCREENSHOTS_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: a leftover file should not abort the whole run
            print(f"Failed to delete {file_path}. Reason: {e}")
    # Split the video into screenshots
    split_video_into_screenshots(
        os.path.join(VIDEO_DIR, video_filename), SCREENSHOTS_DIR, screenshot_interval
    )
    # Get all the screenshots in the directory
    screenshots = [f for f in os.listdir(SCREENSHOTS_DIR) if f.endswith(".jpg")]
    if not screenshots:
        # Nothing to send: the frame extraction produced no images
        print("No screenshots were generated; check the video path and ffmpeg output")
        return
    if len(screenshots) > 20:
        print(f"Too many screenshots: {len(screenshots)}")
        return
    input_image_urls: list[str] = []
    # Frame files are named "<n>.jpg"; sort numerically to keep video order
    sorted_screenshots = sorted(screenshots, key=lambda x: int(x.split(".")[0]))
    for filename in sorted_screenshots:
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        data_url = await image_to_data_url(filepath)
        print(filename)
        input_image_urls.append(data_url)
    # Convert images to the message format for Claude
    content_messages: list[dict[str, Union[dict[str, str], str]]] = []
    for url in input_image_urls:
        # Data URLs look like "data:<media type>;base64,<payload>"
        media_type = url.split(";")[0].split(":")[1]
        base64_data = url.split(",")[1]
        content_messages.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_data,
                },
            }
        )
    prompt_messages = [
        {
            "role": "user",
            "content": content_messages,
        },
        # {"role": "assistant", "content": SECOND_MESSAGE},
        # {"role": "user", "content": "continue"},
    ]
    if is_followup:
        prompt_messages += [
            {"role": "assistant", "content": previous_html},
            {
                "role": "user",
                "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional like in the original video.",
            },
        ]  # type: ignore
    async def process_chunk(content: str):
        # Echo streamed text to the terminal as it arrives
        print(content, end="", flush=True)
    response_prefix = "<thinking>"
    pprint_prompt(prompt_messages)  # type: ignore
    start_time = time.time()
    completion = await stream_claude_response_native(
        system_prompt=VIDEO_PROMPT,
        messages=prompt_messages,
        api_key=ANTHROPIC_API_KEY,
        callback=process_chunk,
        model=MODEL_CLAUDE_OPUS,
        include_thinking=True,
    )
    end_time = time.time()
    # Prepend the response prefix to the completion
    completion = response_prefix + completion
    # Extract the outputs
    html_content = extract_tag_content("html", completion)
    thinking = extract_tag_content("thinking", completion)
    print(thinking)
    print(f"Operation took {end_time - start_time} seconds")
    os.makedirs(OUTPUTS_DIR, exist_ok=True)
    # Generate a unique filename based on the current time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"video_test_output_{timestamp}.html"
    output_path = os.path.join(OUTPUTS_DIR, filename)
    # Write the HTML content to the file
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(html_content)
    # Show a notification, but only where the osascript binary exists;
    # without this check the run crashes at the very end on other platforms
    if shutil.which("osascript"):
        subprocess.run(["osascript", "-e", 'display notification "Coding Complete"'])
 # Extract HTML content from the completion string
 def extract_tag_content(tag: str, text: str) -> str:
    """
    Return the first ``<tag>...</tag>`` section of the text.

    The returned string includes the opening and closing tags themselves.
    The closing tag is searched for from the position of the opening tag.

    :param tag: Tag name without angle brackets, e.g. "html".
    :param text: The text to search within.
    :return: The tagged section, or "" when either tag is missing.
    """
    opening, closing = f"<{tag}>", f"</{tag}>"
    begin = text.find(opening)
    if begin == -1:
        return ""
    finish = text.find(closing, begin)
    if finish == -1:
        return ""
    return text[begin : finish + len(closing)]
 def split_video_into_screenshots(video_path: str, output_dir: str, interval: int):
    """
    Extract JPEG frames from a video by invoking ffmpeg.

    :param video_path: Path of the video handed to ffmpeg via ``-i``.
    :param output_dir: Directory for the frames; created if missing. Frames
        are written using the pattern ``<output_dir>/%d.jpg``.
    :param interval: Divided by 1000 to form the ``fps=1/<seconds>`` filter,
        i.e. the time between frames in milliseconds.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    # Run the ffmpeg command to extract screenshots
    command = [
        "ffmpeg",
        "-i",
        video_path,
        "-vf",
        f"fps=1/{interval/1000}",
        f"{output_dir}/%d.jpg",
    ]
    return_code = subprocess.call(command)
    # Report a failed extraction instead of silently ignoring the exit status
    if return_code != 0:
        print(f"ffmpeg exited with code {return_code} while processing {video_path}")
 async def image_to_data_url(filepath: str) -> str:
    """
    Encode an image file as a base64 ``data:`` URL.

    The media type is guessed from the file extension; when the guess fails
    or is not an image type, "image/jpeg" is used.

    :param filepath: Path of the image file to read.
    :return: A string of the form ``data:<media type>;base64,<payload>``.
    """
    import mimetypes  # local import: only this function needs it

    media_type, _ = mimetypes.guess_type(filepath)
    if not media_type or not media_type.startswith("image/"):
        media_type = "image/jpeg"
    with open(filepath, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return f"data:{media_type};base64,{encoded_string}"
 # Run the pipeline only when executed as a script, so importing this module
 # (e.g. to reuse the helpers) has no side effects
 if __name__ == "__main__":
    asyncio.run(main())