From edfd16ef1db5d7f4163721207aa0af065c067d9f Mon Sep 17 00:00:00 2001 From: Abi Raja Date: Wed, 26 Jun 2024 16:36:33 +0800 Subject: [PATCH] extract only html content --- backend/codegen/__init__.py | 0 backend/codegen/test_utils.py | 57 +++++++++++++++++++++++++++++++++ backend/codegen/utils.py | 14 ++++++++ backend/routes/generate_code.py | 4 +++ 4 files changed, 75 insertions(+) create mode 100644 backend/codegen/__init__.py create mode 100644 backend/codegen/test_utils.py create mode 100644 backend/codegen/utils.py diff --git a/backend/codegen/__init__.py b/backend/codegen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/codegen/test_utils.py b/backend/codegen/test_utils.py new file mode 100644 index 0000000..cd663f8 --- /dev/null +++ b/backend/codegen/test_utils.py @@ -0,0 +1,57 @@ +import unittest +from codegen.utils import extract_html_content + + +class TestUtils(unittest.TestCase): + + def test_extract_html_content_with_html_tags(self): + text = "

Hello, World!

" + expected = "

Hello, World!

" + result = extract_html_content(text) + self.assertEqual(result, expected) + + def test_extract_html_content_without_html_tags(self): + text = "No HTML content here." + expected = "No HTML content here." + result = extract_html_content(text) + self.assertEqual(result, expected) + + def test_extract_html_content_with_partial_html_tags(self): + text = "

Hello, World!

" + expected = "

Hello, World!

" + result = extract_html_content(text) + self.assertEqual(result, expected) + + def test_extract_html_content_with_multiple_html_tags(self): + text = "

First

Some text

Second

" + expected = "

First

" + result = extract_html_content(text) + self.assertEqual(result, expected) + + ## The following are tests based on actual LLM outputs + + def test_extract_html_content_some_explanation_before(self): + text = """Got it! You want the song list to be displayed horizontally. I'll update the code to ensure that the song list is displayed in a horizontal layout. + + Here's the updated code: + + """ + expected = '' + result = extract_html_content(text) + self.assertEqual(result, expected) + + def test_markdown_tags(self): + text = "```html```" + expected = "```html```" + result = extract_html_content(text) + self.assertEqual(result, expected) + + def test_doctype_text(self): + text = '' + expected = '' + result = extract_html_content(text) + self.assertEqual(result, expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/backend/codegen/utils.py b/backend/codegen/utils.py new file mode 100644 index 0000000..107579b --- /dev/null +++ b/backend/codegen/utils.py @@ -0,0 +1,14 @@ +import re + + +def extract_html_content(text: str): + # Use regex to find content within tags and include the tags themselves + match = re.search(r"(.*?)", text, re.DOTALL) + if match: + return match.group(1) + else: + # Otherwise, we just send the previous HTML over + print( + "[HTML Extraction] No tags found in the generated content: " + text + ) + return text diff --git a/backend/routes/generate_code.py b/backend/routes/generate_code.py index d280830..6426f20 100644 --- a/backend/routes/generate_code.py +++ b/backend/routes/generate_code.py @@ -2,6 +2,7 @@ import os import traceback from fastapi import APIRouter, WebSocket import openai +from codegen.utils import extract_html_content from config import ANTHROPIC_API_KEY, IS_PROD, SHOULD_MOCK_AI_RESPONSE from custom_types import InputMode from llm import ( @@ -312,6 +313,9 @@ async def stream_code(websocket: WebSocket): print("Exact used model for generation: ", exact_llm_version) + # Strip the completion of everything except the HTML content + completion = extract_html_content(completion) + # Write the messages dict into a log so that we can debug later write_logs(prompt_messages, completion) # type: ignore