diff --git a/.vscode/settings.json b/.vscode/settings.json
index d6e2638..bfbce46 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,5 @@
{
- "python.analysis.typeCheckingMode": "strict"
+ "python.analysis.typeCheckingMode": "strict",
+ "python.analysis.extraPaths": ["./backend"],
+ "python.autoComplete.extraPaths": ["./backend"]
}
diff --git a/Evaluation.md b/Evaluation.md
new file mode 100644
index 0000000..5fd5da8
--- /dev/null
+++ b/Evaluation.md
@@ -0,0 +1,19 @@
+## Evaluating models and prompts
+
+Evaluation dataset consists of 16 screenshots. A Python script for running screenshot-to-code on the dataset and a UI for rating outputs is included. With this setup, we can compare and evaluate various models and prompts.
+
+### Running evals
+
+- Input screenshots should be located at `backend/evals_data/inputs` and the outputs will be `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO.
+- Set a stack (`STACK` var) in `backend/run_evals.py`
+- Run `python backend/run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete.
+- Once the script is done, you can find the outputs in `backend/evals_data/outputs`.
+
+### Rating evals
+
+In order to view and rate the outputs, visit your front-end at `/evals`.
+
+- Rate each output on a scale of 1-4
+- You can also print the page as PDF to share your results with others.
+
+Generally, I run three tests for each model/prompt + stack combo and take the average score out of those tests to evaluate.
diff --git a/README.md b/README.md
index bc82497..f8b6795 100644
--- a/README.md
+++ b/README.md
@@ -1,28 +1,23 @@
# screenshot-to-code
-This simple app converts a screenshot to code (HTML/Tailwind CSS, or React or Bootstrap or Vue). It uses GPT-4 Vision to generate the code and DALL-E 3 to generate similar-looking images. You can now also enter a URL to clone a live website!
+This simple app converts a screenshot to code (HTML/Tailwind CSS, or React or Bootstrap or Vue). It uses GPT-4 Vision (or Claude 3) to generate the code and DALL-E 3 to generate similar-looking images. You can now also enter a URL to clone a live website.
+
+🆕 Now supporting Claude 3!
https://github.com/abi/screenshot-to-code/assets/23818/6cebadae-2fe3-4986-ac6a-8fb9db030045
See the [Examples](#-examples) section below for more demos.
+[Follow me on Twitter for updates](https://twitter.com/_abi_).
+
## 🚀 Try It Out!
🆕 [Try it here](https://screenshottocode.com) (bring your own OpenAI key - **your key must have access to GPT-4 Vision. See [FAQ](#%EF%B8%8F-faqs) section below for details**). Or see [Getting Started](#-getting-started) below for local install instructions.
## 🌟 Recent Updates
-- Dec 11 - Start a new project from existing code (allows you to come back to an older project)
-- Dec 7 - 🔥 🔥 🔥 View a history of your edits, and branch off them
-- Nov 30 - Dark mode, output code in Ionic (thanks [@dialmedu](https://github.com/dialmedu)), set OpenAI base URL
-- Nov 28 - 🔥 🔥 🔥 Customize your stack: React or Bootstrap or TailwindCSS
-- Nov 23 - Send in a screenshot of the current replicated version (sometimes improves quality of subsequent generations)
-- Nov 21 - Edit code in the code editor and preview changes live thanks to [@clean99](https://github.com/clean99)
-- Nov 20 - Paste in a URL to screenshot and clone (requires [ScreenshotOne free API key](https://screenshotone.com?via=screenshot-to-code))
-- Nov 19 - Support for dark/light code editor theme - thanks [@kachbit](https://github.com/kachbit)
-- Nov 16 - Added a setting to disable DALL-E image generation if you don't need that
-- Nov 16 - View code directly within the app
-- Nov 15 - You can now instruct the AI to update the code as you wish. It is helpful if the AI messed up some styles or missed a section.
+- Mar 8 - 🔥🎉🎁 Video-to-app: turn videos/screen recordings into functional apps
+- Mar 5 - Added support for Claude Sonnet 3 (as capable as or better than GPT-4 Vision, and faster!)
## 🛠 Getting Started
@@ -38,12 +33,6 @@ poetry shell
poetry run uvicorn main:app --reload --port 7001
```
-You can also run the backend (when you're in `backend`):
-
-```bash
-poetry run pyright
-```
-
Run the frontend:
```bash
@@ -62,10 +51,25 @@ For debugging purposes, if you don't want to waste GPT4-Vision credits, you can
MOCK=true poetry run uvicorn main:app --reload --port 7001
```
+## Video to app (experimental)
+
+https://github.com/abi/screenshot-to-code/assets/23818/1468bef4-164f-4046-a6c8-4cfc40a5cdff
+
+Record yourself using any website or app or even a Figma prototype, drag & drop in a video and in a few minutes, get a functional, similar-looking app.
+
+[You need an Anthropic API key for this functionality. Follow instructions here.](https://github.com/abi/screenshot-to-code/blob/main/blog/video-to-app.md)
+
## Configuration
- You can configure the OpenAI base URL if you need to use a proxy: Set OPENAI_BASE_URL in the `backend/.env` or directly in the UI in the settings dialog
+## Using Claude 3
+
+We recently added support for Claude 3 Sonnet. It performs well, on par or better than GPT-4 Vision for many inputs, and it tends to be faster.
+
+1. Add an env var `ANTHROPIC_API_KEY` to `backend/.env` with your API key from Anthropic
+2. When using the front-end, select "Claude 3 Sonnet" from the model dropdown
+
## Docker
If you have Docker installed on your system, in the root directory, run:
diff --git a/Troubleshooting.md b/Troubleshooting.md
index 20fa815..ac6fe11 100644
--- a/Troubleshooting.md
+++ b/Troubleshooting.md
@@ -5,11 +5,12 @@ You don't need a ChatGPT Pro account. Screenshot to code uses API keys from your
1. Open [OpenAI Dashboard](https://platform.openai.com/)
1. Go to Settings > Billing
1. Click at the Add payment details
-
-4. You have to buy some credits. The minimum is $5.
+
+4. You have to buy some credits. The minimum is $5.
5. Go to Settings > Limits and check at the bottom of the page, your current tier has to be "Tier 1" to have GPT4 access
-
+
+
6. Go to Screenshot to code and paste it in the Settings dialog under OpenAI key (gear icon). Your key is only stored in your browser. Never stored on our servers.
Some users have also reported that it can take upto 30 minutes after your credit purchase for the GPT4 vision model to be activated.
diff --git a/backend/.gitignore b/backend/.gitignore
index a42aad3..5d03006 100644
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -154,3 +154,7 @@ cython_debug/
# Temporary eval output
evals_data
+
+
+# Temporary video evals (Remove before merge)
+video_evals
diff --git a/backend/.pre-commit-config.yaml b/backend/.pre-commit-config.yaml
new file mode 100644
index 0000000..b54da93
--- /dev/null
+++ b/backend/.pre-commit-config.yaml
@@ -0,0 +1,25 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v3.2.0
+ hooks:
+ - id: end-of-file-fixer
+ - id: check-yaml
+ - id: check-added-large-files
+ - repo: local
+ hooks:
+ - id: poetry-pytest
+ name: Run pytest with Poetry
+ entry: poetry run --directory backend pytest
+ language: system
+ pass_filenames: false
+ always_run: true
+ files: ^backend/
+ # - id: poetry-pyright
+ # name: Run pyright with Poetry
+ # entry: poetry run --directory backend pyright
+ # language: system
+ # pass_filenames: false
+ # always_run: true
+ # files: ^backend/
diff --git a/backend/config.py b/backend/config.py
index f944539..f12c969 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -3,6 +3,7 @@
# TODO: Should only be set to true when value is 'True', not any abitrary truthy value
import os
+ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", None)
SHOULD_MOCK_AI_RESPONSE = bool(os.environ.get("MOCK", False))
diff --git a/backend/custom_types.py b/backend/custom_types.py
new file mode 100644
index 0000000..b6c9fee
--- /dev/null
+++ b/backend/custom_types.py
@@ -0,0 +1,7 @@
+from typing import Literal
+
+
+InputMode = Literal[
+ "image",
+ "video",
+]
diff --git a/backend/evals/core.py b/backend/evals/core.py
index 61db1a3..3438a7d 100644
--- a/backend/evals/core.py
+++ b/backend/evals/core.py
@@ -1,29 +1,40 @@
import os
+from config import ANTHROPIC_API_KEY
-from llm import stream_openai_response
+from llm import stream_claude_response, stream_openai_response
from prompts import assemble_prompt
from prompts.types import Stack
-from utils import pprint_prompt
async def generate_code_core(image_url: str, stack: Stack) -> str:
+ model = "CLAUDE"
+
prompt_messages = assemble_prompt(image_url, stack)
openai_api_key = os.environ.get("OPENAI_API_KEY")
+ anthropic_api_key = ANTHROPIC_API_KEY
openai_base_url = None
- pprint_prompt(prompt_messages)
-
async def process_chunk(content: str):
pass
- if not openai_api_key:
- raise Exception("OpenAI API key not found")
+ if model == "CLAUDE":
+ if not anthropic_api_key:
+ raise Exception("Anthropic API key not found")
- completion = await stream_openai_response(
- prompt_messages,
- api_key=openai_api_key,
- base_url=openai_base_url,
- callback=lambda x: process_chunk(x),
- )
+ completion = await stream_claude_response(
+ prompt_messages,
+ api_key=anthropic_api_key,
+ callback=lambda x: process_chunk(x),
+ )
+ else:
+ if not openai_api_key:
+ raise Exception("OpenAI API key not found")
+
+ completion = await stream_openai_response(
+ prompt_messages,
+ api_key=openai_api_key,
+ base_url=openai_base_url,
+ callback=lambda x: process_chunk(x),
+ )
return completion
diff --git a/backend/image_generation.py b/backend/image_generation.py
index d3e71b1..b93792c 100644
--- a/backend/image_generation.py
+++ b/backend/image_generation.py
@@ -15,7 +15,7 @@ async def process_tasks(prompts: List[str], api_key: str, base_url: str):
print(f"An exception occurred: {result}")
processed_results.append(None)
else:
- processed_results.append(result)
+ processed_results.append(result) # type: ignore
return processed_results
@@ -30,7 +30,7 @@ async def generate_image(prompt: str, api_key: str, base_url: str):
"size": "1024x1024",
"prompt": prompt,
}
- res = await client.images.generate(**image_params)
+ res = await client.images.generate(**image_params) # type: ignore
await client.close()
return res.data[0].url
@@ -77,26 +77,26 @@ async def generate_images(
img["src"].startswith("https://placehold.co")
and image_cache.get(img.get("alt")) is None
):
- alts.append(img.get("alt", None))
+ alts.append(img.get("alt", None)) # type: ignore
# Exclude images with no alt text
- alts = [alt for alt in alts if alt is not None]
+ alts = [alt for alt in alts if alt is not None] # type: ignore
# Remove duplicates
- prompts = list(set(alts))
+ prompts = list(set(alts)) # type: ignore
# Return early if there are no images to replace
- if len(prompts) == 0:
+ if len(prompts) == 0: # type: ignore
return code
# Generate images
- results = await process_tasks(prompts, api_key, base_url)
+ results = await process_tasks(prompts, api_key, base_url) # type: ignore
# Create a dict mapping alt text to image URL
- mapped_image_urls = dict(zip(prompts, results))
+ mapped_image_urls = dict(zip(prompts, results)) # type: ignore
# Merge with image_cache
- mapped_image_urls = {**mapped_image_urls, **image_cache}
+ mapped_image_urls = {**mapped_image_urls, **image_cache} # type: ignore
# Replace old image URLs with the generated URLs
for img in images:
diff --git a/backend/llm.py b/backend/llm.py
index 66e3a47..c7032c5 100644
--- a/backend/llm.py
+++ b/backend/llm.py
@@ -1,8 +1,21 @@
-from typing import Awaitable, Callable, List
+from typing import Any, Awaitable, Callable, List, cast
+from anthropic import AsyncAnthropic
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
+from utils import pprint_prompt
+
MODEL_GPT_4_VISION = "gpt-4-vision-preview"
+MODEL_CLAUDE_SONNET = "claude-3-sonnet-20240229"
+MODEL_CLAUDE_OPUS = "claude-3-opus-20240229"
+MODEL_CLAUDE_HAIKU = "claude-3-haiku-20240307"
+
+
+# Keep in sync with frontend (lib/models.ts)
+CODE_GENERATION_MODELS = [
+ "gpt_4_vision",
+ "claude_3_sonnet",
+]
async def stream_openai_response(
@@ -34,3 +47,126 @@ async def stream_openai_response(
await client.close()
return full_response
+
+
+# TODO: Have a separate function that translates OpenAI messages to Claude messages
+async def stream_claude_response(
+ messages: List[ChatCompletionMessageParam],
+ api_key: str,
+ callback: Callable[[str], Awaitable[None]],
+) -> str:
+
+ client = AsyncAnthropic(api_key=api_key)
+
+ # Base parameters
+ model = MODEL_CLAUDE_SONNET
+ max_tokens = 4096
+ temperature = 0.0
+
+ # Translate OpenAI messages to Claude messages
+ system_prompt = cast(str, messages[0]["content"])
+ claude_messages = [dict(message) for message in messages[1:]]
+ for message in claude_messages:
+ if not isinstance(message["content"], list):
+ continue
+
+ for content in message["content"]: # type: ignore
+ if content["type"] == "image_url":
+ content["type"] = "image"
+
+ # Extract base64 data and media type from data URL
+ # Example base64 data URL: data:image/png;base64,iVBOR...
+ image_data_url = cast(str, content["image_url"]["url"])
+ media_type = image_data_url.split(";")[0].split(":")[1]
+ base64_data = image_data_url.split(",")[1]
+
+ # Remove OpenAI parameter
+ del content["image_url"]
+
+ content["source"] = {
+ "type": "base64",
+ "media_type": media_type,
+ "data": base64_data,
+ }
+
+ # Stream Claude response
+ async with client.messages.stream(
+ model=model,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ system=system_prompt,
+ messages=claude_messages, # type: ignore
+ ) as stream:
+ async for text in stream.text_stream:
+ await callback(text)
+
+ # Return final message
+ response = await stream.get_final_message()
+ return response.content[0].text
+
+
+async def stream_claude_response_native(
+ system_prompt: str,
+ messages: list[Any],
+ api_key: str,
+ callback: Callable[[str], Awaitable[None]],
+ include_thinking: bool = False,
+ model: str = MODEL_CLAUDE_OPUS,
+) -> str:
+
+ client = AsyncAnthropic(api_key=api_key)
+
+ # Base model parameters
+ max_tokens = 4096
+ temperature = 0.0
+
+ # Multi-pass flow
+ current_pass_num = 1
+ max_passes = 2
+
+ prefix = ""
+ response = None
+
+ while current_pass_num <= max_passes:
+ current_pass_num += 1
+
+ # Set up message depending on whether we have a prefix
+ messages_to_send = (
+ messages + [{"role": "assistant", "content": prefix}]
+ if include_thinking
+ else messages
+ )
+
+ pprint_prompt(messages_to_send)
+
+ async with client.messages.stream(
+ model=model,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ system=system_prompt,
+ messages=messages_to_send, # type: ignore
+ ) as stream:
+ async for text in stream.text_stream:
+ print(text, end="", flush=True)
+ await callback(text)
+
+ # Return final message
+ response = await stream.get_final_message()
+
+ # Set up messages array for next pass
+ messages += [
+ {"role": "assistant", "content": str(prefix) + response.content[0].text},
+ {
+ "role": "user",
+ "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional and looks like the original video of the app we're trying to replicate.",
+ },
+ ]
+
+ print(
+ f"Token usage: Input Tokens: {response.usage.input_tokens}, Output Tokens: {response.usage.output_tokens}"
+ )
+
+ if not response:
+ raise Exception("No HTML response found in AI response")
+ else:
+ return response.content[0].text
diff --git a/backend/mock_llm.py b/backend/mock_llm.py
index 0102bad..0b903b7 100644
--- a/backend/mock_llm.py
+++ b/backend/mock_llm.py
@@ -1,14 +1,35 @@
import asyncio
from typing import Awaitable, Callable
+from custom_types import InputMode
-async def mock_completion(process_chunk: Callable[[str], Awaitable[None]]) -> str:
- code_to_return = NO_IMAGES_NYTIMES_MOCK_CODE
- for i in range(0, len(code_to_return), 10):
- await process_chunk(code_to_return[i : i + 10])
+STREAM_CHUNK_SIZE = 5
+
+
+async def mock_completion(
+ process_chunk: Callable[[str], Awaitable[None]], input_mode: InputMode
+) -> str:
+ code_to_return = (
+ GOOGLE_FORM_VIDEO_PROMPT_MOCK
+ if input_mode == "video"
+ else NO_IMAGES_NYTIMES_MOCK_CODE
+ )
+
+ for i in range(0, len(code_to_return), STREAM_CHUNK_SIZE):
+ await process_chunk(code_to_return[i : i + STREAM_CHUNK_SIZE])
await asyncio.sleep(0.01)
+ if input_mode == "video":
+        # Extract the last <html></html> block from code_to_return
+        # because we can have multiple passes
+        start = code_to_return.rfind("<html>")
+        end = code_to_return.rfind("</html>") + len("</html>")
+        if start != -1 and end != -1:
+ code_to_return = code_to_return[start:end]
+ else:
+ code_to_return = "Error: HTML block not found."
+
return code_to_return
@@ -206,3 +227,1291 @@ NO_IMAGES_NYTIMES_MOCK_CODE = """