refactor: replace requests with the Firecrawl SDK in search and replace langchain_community's FireCrawlLoader with the Firecrawl SDK in scrape

wei840222 2025-10-26 15:00:37 +08:00
parent d11d49a08a
commit 7a3f4d85f6
4 changed files with 87 additions and 63 deletions
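
In short: the search path drops the hand-rolled requests.post against /v1/search in favor of Firecrawl.search, and the scrape path drops langchain_community's FireCrawlLoader in favor of Firecrawl.batch_scrape. A minimal sketch of the two SDK calls as this diff uses them, assuming firecrawl-py 4.x; the API key, query, and URLs are placeholders:

    from firecrawl import Firecrawl

    # Placeholder credentials for illustration only.
    client = Firecrawl(api_key="fc-YOUR-KEY")

    # Search: replaces the manual POST to /v1/search.
    search = client.search(query="open webui", limit=3)
    for hit in search.web:
        print(hit.url, hit.title, hit.description)

    # Batch scrape: replaces langchain_community's FireCrawlLoader.
    job = client.batch_scrape(["https://docs.firecrawl.dev"], formats=["markdown"])
    if job.status == "completed":
        for doc in job.data:
            print((doc.markdown or "")[:80])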

View file

@@ -1,11 +1,11 @@
 import logging
 from typing import Optional, List
 from urllib.parse import urljoin
-import requests
 from open_webui.retrieval.web.main import SearchResult, get_filtered_results
 from open_webui.env import SRC_LOG_LEVELS
+from firecrawl import Firecrawl
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -18,27 +18,18 @@ def search_firecrawl(
     filter_list: Optional[List[str]] = None,
 ) -> List[SearchResult]:
     try:
-        firecrawl_search_url = urljoin(firecrawl_url, "/v1/search")
-        response = requests.post(
-            firecrawl_search_url,
-            headers={
-                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
-                "Authorization": f"Bearer {firecrawl_api_key}",
-            },
-            json={
-                "query": query,
-                "limit": count,
-            },
+        firecrawl = Firecrawl(api_key=firecrawl_api_key, api_url=firecrawl_url)
+        response = firecrawl.search(
+            query=query, limit=count, ignore_invalid_urls=True, timeout=count * 3
         )
-        response.raise_for_status()
-        results = response.json().get("data", [])
+        results = response.web
         if filter_list:
             results = get_filtered_results(results, filter_list)
         results = [
             SearchResult(
-                link=result.get("url"),
-                title=result.get("title"),
-                snippet=result.get("description"),
+                link=result.url,
+                title=result.title,
+                snippet=result.description,
             )
             for result in results[:count]
         ]
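
For context, a call-site sketch of the refactored function; the keyword names follow the identifiers in the body above, and the key, URL, and query are placeholders:

    # Hypothetical invocation for illustration.
    hits = search_firecrawl(
        firecrawl_api_key="fc-YOUR-KEY",
        firecrawl_url="https://api.firecrawl.dev",
        query="self-hosted web ui",
        count=5,
        filter_list=None,
    )
    for hit in hits:
        print(hit.link, "-", hit.snippet)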

View file

@@ -4,7 +4,6 @@ import socket
 import ssl
 import urllib.parse
 import urllib.request
-from collections import defaultdict
 from datetime import datetime, time, timedelta
 from typing import (
     Any,
@@ -21,7 +20,6 @@ import aiohttp
 import certifi
 import validators
 from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
-from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader
@@ -39,7 +37,9 @@ from open_webui.config import (
     EXTERNAL_WEB_LOADER_URL,
     EXTERNAL_WEB_LOADER_API_KEY,
 )
-from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL
+from open_webui.env import SRC_LOG_LEVELS
+from firecrawl import Firecrawl
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -189,13 +189,12 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
                 (uses FIRE_CRAWL_API_KEY environment variable if not provided).
             api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
             mode: Operation mode selection:
-                - 'crawl': Website crawling mode (default)
-                - 'scrape': Direct page scraping
+                - 'crawl': Website crawling mode
+                - 'scrape': Direct page scraping (default)
                 - 'map': Site map generation
             proxy: Proxy override settings for the FireCrawl API.
             params: The parameters to pass to the Firecrawl API.
                 Examples include crawlerOptions.
-                For more details, visit: https://github.com/mendableai/firecrawl-py
+                For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
         """
         proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
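
To make the docstring concrete, a construction sketch; the keyword names are inferred from the attributes assigned in the hunk below, and the params key is only an assumption about what the Firecrawl scrape endpoint accepts:

    # Hypothetical usage for illustration; keywords mirror the attributes
    # set in __init__ (web_paths, api_key, api_url, mode, params).
    loader = SafeFireCrawlLoader(
        web_paths=["https://docs.firecrawl.dev", "https://firecrawl.dev"],
        api_key="fc-YOUR-KEY",
        api_url="https://api.firecrawl.dev",
        mode="scrape",
        params={"only_main_content": True},  # assumed option; forwarded via **self.params
    )
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))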
@@ -215,50 +214,84 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         self.api_key = api_key
         self.api_url = api_url
         self.mode = mode
-        self.params = params
+        self.params = params or {}
     def lazy_load(self) -> Iterator[Document]:
-        """Load documents concurrently using FireCrawl."""
-        for url in self.web_paths:
-            try:
-                self._safe_process_url_sync(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
+        """Load documents using FireCrawl batch_scrape."""
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
         )
-                for document in loader.lazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
         except Exception as e:
             if self.continue_on_failure:
-                log.exception(f"Error loading {url}: {e}")
-                continue
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
                 raise e
     async def alazy_load(self):
         """Async version of lazy_load."""
-        for url in self.web_paths:
-            try:
-                await self._safe_process_url(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
         )
-                async for document in loader.alazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e
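
Note that alazy_load above still calls the synchronous client. If the SDK's async client is available, the same flow could be awaited instead — a hedged sketch, assuming firecrawl-py also exposes AsyncFirecrawl with a batch_scrape that mirrors the synchronous signature (not shown in this diff):

    from firecrawl import AsyncFirecrawl  # assumption: async twin of Firecrawl

    async def scrape_markdown(urls: list[str], api_key: str) -> list[str]:
        # Mirrors the error contract used above: anything short of
        # "completed" is treated as a failure.
        client = AsyncFirecrawl(api_key=api_key)
        job = await client.batch_scrape(urls, formats=["markdown"])
        if job.status != "completed":
            raise RuntimeError(f"batch scrape ended with status: {job.status}")
        return [doc.markdown or "" for doc in job.data]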

View file

@@ -133,7 +133,7 @@ pytest-docker~=3.1.1
 ldap3==2.9.1
 ## Firecrawl
-firecrawl-py==1.12.0
+firecrawl-py==4.5.0
 ## Trace
 opentelemetry-api==1.37.0

View file

@@ -151,7 +151,7 @@ all = [
     "oracledb==3.2.0",
     "colbert-ai==0.2.21",
-    "firecrawl-py==1.12.0",
+    "firecrawl-py==4.5.0",
 ]
 [project.scripts]