Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion api/routes/tools.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Utility and Tool API endpoints."""

import logging
from typing import Optional

from fastapi import APIRouter, HTTPException, Query

Expand Down
1 change: 0 additions & 1 deletion casts/blog_writer/modules/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from typing import Optional

from langchain.agents import create_agent
from langchain_core.language_models import BaseChatModel

from .models import get_llm
from .state import LLMProvider
Expand Down
78 changes: 55 additions & 23 deletions casts/blog_writer/modules/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import json
import re
from typing import Any, Optional

import markdown

Expand All @@ -33,6 +34,50 @@
from casts.blog_writer.modules.tools import fetch_content, generate_image


def _extract_json(text: str) -> Optional[Any]:
"""Extract JSON from LLM response text.

Handles multiple formats:
1. ```json ... ``` code blocks
2. ``` ... ``` code blocks without language tag
3. Raw JSON object {...}
4. Raw JSON array [...]
"""
# Try code block with json tag
json_match = re.search(r"```json\s*([\s\S]*?)```", text)
if json_match:
try:
return json.loads(json_match.group(1).strip())
except json.JSONDecodeError:
pass

# Try code block without tag
json_match = re.search(r"```\s*([\s\S]*?)```", text)
if json_match:
try:
return json.loads(json_match.group(1).strip())
except json.JSONDecodeError:
pass

# Try to find raw JSON object
json_match = re.search(r"\{[\s\S]*\}", text)
if json_match:
try:
return json.loads(json_match.group(0))
except json.JSONDecodeError:
pass

# Try to find raw JSON array
json_match = re.search(r"\[[\s\S]*\]", text)
if json_match:
try:
return json.loads(json_match.group(0))
except json.JSONDecodeError:
pass

return None


class FetchContent(AsyncBaseNode):
"""URL에서 웹 콘텐츠 수집 (BS4/Playwright)."""

Expand Down Expand Up @@ -79,15 +124,8 @@ async def execute(self, state, config=None):
self.log("컨텐츠 분석 중...")
response = await llm.ainvoke(prompt)

try:
# Parse JSON from response
content = response.content
# Extract JSON from markdown code block if present
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
if json_match:
content = json_match.group(1)
analyzed_content = json.loads(content)
except json.JSONDecodeError:
analyzed_content = _extract_json(response.content)
if not analyzed_content:
# Fallback structure
analyzed_content = {
"title": "Untitled",
Expand Down Expand Up @@ -134,14 +172,13 @@ async def execute(self, state, config=None):
self.log("키워드 추천 중...")
response = await llm.ainvoke(prompt)

try:
content = response.content
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
if json_match:
content = json_match.group(1)
data = json.loads(content)
data = _extract_json(response.content)
if data and isinstance(data, dict) and "keywords" in data:
suggested_keywords = data.get("keywords", [])[:30]
except json.JSONDecodeError:
elif data and isinstance(data, list):
# Handle case where LLM returns just an array
suggested_keywords = data[:30]
else:
# Fallback: extract any quoted words
suggested_keywords = re.findall(r'"([^"]+)"', response.content)[:30]
Comment on lines +175 to 183
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's locate the file and examine the code at lines 175-183
cat -n casts/blog_writer/modules/nodes.py | sed -n '170,190p'

Repository: WithModulabs/blog-agent

Length of output: 905


🏁 Script executed:

# Let's also understand what _extract_json returns
rg "_extract_json" casts/blog_writer/modules/nodes.py -A 10 -B 2

Repository: WithModulabs/blog-agent

Length of output: 1771


🏁 Script executed:

# Check for any tests or documentation about expected behavior
fd -e py casts/blog_writer | xargs rg "_extract_json" -l

Repository: WithModulabs/blog-agent

Length of output: 492


Distinguish parse failure from empty keyword lists.

The current code uses if data which treats empty lists [] as falsy and unintentionally triggers the regex fallback. Replace the truthiness check with explicit type checking and data is not None to distinguish between parse failure (None) and valid empty results, preventing noise from unnecessary fallback extraction.

Suggested fix
         data = _extract_json(response.content)
-        if data and isinstance(data, dict) and "keywords" in data:
+        if isinstance(data, dict):
+            if "keywords" in data:
                 suggested_keywords = data.get("keywords", [])[:30]
-        elif data and isinstance(data, list):
+            else:
+                # Fallback: extract any quoted words
+                suggested_keywords = re.findall(r'"([^"]+)"', response.content)[:30]
+        elif isinstance(data, list):
             # Handle case where LLM returns just an array
             suggested_keywords = data[:30]
-        else:
+        elif data is None:
             # Fallback: extract any quoted words
             suggested_keywords = re.findall(r'"([^"]+)"', response.content)[:30]
+        else:
+            suggested_keywords = []
🤖 Prompt for AI Agents
In `@casts/blog_writer/modules/nodes.py` around lines 175 - 183, The logic around
parsing response.content should distinguish parse failures (None) from valid
empty lists; update the conditional checks that set suggested_keywords to
explicitly test types and None instead of truthiness: call
_extract_json(response.content) into data and then use "if data is not None and
isinstance(data, dict)" to handle dicts (use data.get("keywords", [])[:30]),
"elif isinstance(data, list)" to handle lists (use data[:30]) and finally handle
the parse-failure case (data is None) by running the regex fallback on
response.content; reference _extract_json, response.content, and
suggested_keywords when making these changes.


Expand Down Expand Up @@ -226,13 +263,8 @@ async def execute(self, state, config=None):
self.log("SEO 최적화 중...")
response = await llm.ainvoke(prompt)

try:
content = response.content
json_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", content)
if json_match:
content = json_match.group(1)
seo_meta = json.loads(content)
except json.JSONDecodeError:
seo_meta = _extract_json(response.content)
if not seo_meta:
seo_meta = {
"title": "Blog Post",
"description": blog_markdown[:160],
Expand Down
1 change: 1 addition & 0 deletions casts/blog_writer/modules/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
- SEO에 효과적인 키워드
- 검색량이 높을 것으로 예상되는 키워드
- 콘텐츠 주제와 밀접하게 관련된 키워드
- 반드시 한글 키워드로 제안 (영문 브랜드명/기술명은 그대로 사용 가능)

JSON 형식으로 키워드를 리스트로 반환하세요 (최대 30개):
{{"keywords": ["키워드1", "키워드2", ...]}}"""
Expand Down
64 changes: 60 additions & 4 deletions casts/blog_writer/modules/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ async def fetch_with_playwright(url: str) -> str:

await page.goto(url, wait_until="networkidle")

# Get main content
content = await page.content()
# Handle Naver blog iframe
if "blog.naver.com" in url:
content = await _extract_naver_blog_content(page)
else:
content = await page.content()

await browser.close()

soup = BeautifulSoup(content, "html.parser")
Expand All @@ -74,10 +78,36 @@ async def fetch_with_playwright(url: str) -> str:
return "\n".join(lines)


async def _extract_naver_blog_content(page) -> str:
    """Return the post HTML from a Naver blog page.

    Naver blogs render the actual post inside ``iframe#mainFrame``; this
    waits for that frame and a known post container to appear and returns
    the frame's HTML. On any failure (timeout, missing frame) it falls
    back to the outer page's HTML.
    """
    try:
        iframe = await page.wait_for_selector("iframe#mainFrame", timeout=10000)
        frame = await iframe.content_frame() if iframe else None
        if frame:
            # Selectors cover the different Naver editor versions
            # (SmartEditor ONE, legacy post view, classic editor).
            await frame.wait_for_selector(
                ".se-main-container, .post-view, #postViewArea",
                timeout=10000,
            )
            return await frame.content()
    except Exception:
        # Best-effort: any Playwright error drops us to the fallback below.
        pass

    # Fallback: outer page content (better than nothing for downstream parsing)
    return await page.content()


async def fetch_content(
    url: str, scraper_type: ScraperType = ScraperType.BEAUTIFULSOUP
) -> str:
    """Fetch web content using configured scraper with fallback support.

    Args:
        url: URL to fetch
        scraper_type: Preferred scraper backend.

    Returns:
        Extracted text content

    Raises:
        Exception: Propagates scraper errors when no fallback succeeds.
    """
    # If explicitly requested Playwright, use it directly
    if scraper_type == ScraperType.PLAYWRIGHT:
        return await fetch_with_playwright(url)

    # Try BeautifulSoup first
    try:
        content = await fetch_with_beautifulsoup(url)
    except Exception:
        # BS4 fetch failed outright (network/parse error). Playwright is the
        # only remaining option, so let its errors propagate to the caller.
        return await fetch_with_playwright(url)

    # Fallback to Playwright if content seems insufficient
    # (JS-rendered sites like Naver blog return minimal content with BS4)
    if len(content) < 200 or _is_js_rendered_site(url):
        try:
            playwright_content = await fetch_with_playwright(url)
            # Only prefer Playwright's result when it actually found more text
            if len(playwright_content) > len(content):
                return playwright_content
        except Exception:
            pass  # Stick with BS4 content if Playwright fails

    return content


def _is_js_rendered_site(url: str) -> bool:
"""Check if URL is known to require JavaScript rendering."""
js_sites = [
"blog.naver.com",
"m.blog.naver.com",
"post.naver.com",
"brunch.co.kr",
]
return any(site in url for site in js_sites)


# =============================================================================
Expand Down
1 change: 0 additions & 1 deletion casts/chat/modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
- Embedding Models: https://docs.langchain.com/oss/python/integrations/text_embedding
"""

from typing import Optional

from langchain.chat_models import init_chat_model
from langchain_core.language_models import BaseChatModel
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"langchain-openai>=1.1.6",
"langgraph>=1.0.0",
"markdown>=3.10",
"playwright>=1.58.0",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for Playwright install steps, browser path config, and environment variables
echo "=== Searching for playwright install patterns ==="
rg -n "playwright install|PLAYWRIGHT_BROWSERS_PATH|PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD" --max-count=20

echo ""
echo "=== Searching in Docker/CI files ==="
rg -n "playwright" -g "Dockerfile*" -g "*.yml" -g "*.yaml" --max-count=20

echo ""
echo "=== Searching in documentation ==="
rg -n "playwright" -g "*.md" --max-count=20

echo ""
echo "=== Checking for requirements or setup files ==="
find . -name "requirements*.txt" -o -name "setup.py" -o -name "poetry.lock" -o -name "setup.cfg" 2>/dev/null | head -10

Repository: WithModulabs/blog-agent

Length of output: 369


🏁 Script executed:

#!/bin/bash
# Check requirements.txt for Playwright
echo "=== requirements.txt content ==="
cat requirements.txt

echo ""
echo "=== Check for CI/workflow files ==="
find . -type f -path "*/.github/workflows/*" -o -name "*.github.yml" -o -path "*/ci/*" 2>/dev/null | head -20

echo ""
echo "=== List CI/Docker files ==="
ls -la | grep -i docker
ls -la .github/workflows/ 2>/dev/null || echo "No .github/workflows directory"

echo ""
echo "=== Check for docker-compose or Dockerfile ==="
find . -name "Dockerfile*" -o -name "docker-compose*" 2>/dev/null

echo ""
echo "=== Context from CLAUDE.md ==="
sed -n '90,140p' casts/blog_writer/CLAUDE.md

Repository: WithModulabs/blog-agent

Length of output: 1981


🏁 Script executed:

#!/bin/bash
echo "=== CI Workflow: pr_lint.yml ==="
cat .github/workflows/pr_lint.yml

echo ""
echo "=== CI Workflow: ruff.yml ==="
cat .github/workflows/ruff.yml

echo ""
echo "=== CI Workflow: uv_lock_upgrade.yml ==="
head -50 .github/workflows/uv_lock_upgrade.yml

echo ""
echo "=== Check pyproject.toml for test/scripts config ==="
cat pyproject.toml

echo ""
echo "=== Check for README ==="
ls -la README* 2>/dev/null && head -50 README.md

Repository: WithModulabs/blog-agent

Length of output: 5545


Clarify Playwright browser setup requirements based on usage.

Playwright is an optional scraper backend (default is BEAUTIFULSOUP), so browser installation is only needed if ScraperType.PLAYWRIGHT is selected. Since this is a library/API backend, document in README or tests how consumers should install Playwright browsers if using the Playwright scraper—either via playwright install or by setting PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 with pre-installed browsers.

🤖 Prompt for AI Agents
In `@pyproject.toml` at line 17, The dependency note for Playwright should clarify
that browser binaries are only required when using the Playwright scraper
(ScraperType.PLAYWRIGHT); update documentation and tests to document
installation options: add a README section and a test/setup note showing
consumers they must run `playwright install` or set
`PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1` when they have pre-installed browsers, and
mention that Playwright is optional (default remains BEAUTIFULSOUP); reference
Playwright and ScraperType.PLAYWRIGHT in the docs and any integration tests so
users know when browser installation is necessary.

"pydantic-settings>=2.1.0",
"python-dotenv>=1.0.1",
"uvicorn[standard]>=0.34.0",
Expand Down
Loading