refactor: replace requests with the Firecrawl SDK in web search, and use the Firecrawl SDK in web scrape rather than langchain_community's FireCrawlLoader

wei840222 2025-10-26 15:00:37 +08:00
parent d11d49a08a
commit 7a3f4d85f6
4 changed files with 87 additions and 63 deletions

View file

@@ -1,11 +1,11 @@
 import logging
 from typing import Optional, List
-from urllib.parse import urljoin
 
-import requests
 from open_webui.retrieval.web.main import SearchResult, get_filtered_results
 from open_webui.env import SRC_LOG_LEVELS
+
+from firecrawl import Firecrawl
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -18,27 +18,18 @@ def search_firecrawl(
     filter_list: Optional[List[str]] = None,
 ) -> List[SearchResult]:
     try:
-        firecrawl_search_url = urljoin(firecrawl_url, "/v1/search")
-        response = requests.post(
-            firecrawl_search_url,
-            headers={
-                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
-                "Authorization": f"Bearer {firecrawl_api_key}",
-            },
-            json={
-                "query": query,
-                "limit": count,
-            },
+        firecrawl = Firecrawl(api_key=firecrawl_api_key, api_url=firecrawl_url)
+        response = firecrawl.search(
+            query=query, limit=count, ignore_invalid_urls=True, timeout=count * 3
         )
-        response.raise_for_status()
-        results = response.json().get("data", [])
+        results = response.web
 
         if filter_list:
             results = get_filtered_results(results, filter_list)
         results = [
             SearchResult(
-                link=result.get("url"),
-                title=result.get("title"),
-                snippet=result.get("description"),
+                link=result.url,
+                title=result.title,
+                snippet=result.description,
             )
             for result in results[:count]
         ]

View file

@@ -4,7 +4,6 @@ import socket
 import ssl
 import urllib.parse
 import urllib.request
-from collections import defaultdict
 from datetime import datetime, time, timedelta
 from typing import (
     Any,
@@ -21,7 +20,6 @@ import aiohttp
 import certifi
 import validators
 from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
-from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader
@@ -39,7 +37,9 @@ from open_webui.config import (
     EXTERNAL_WEB_LOADER_URL,
     EXTERNAL_WEB_LOADER_API_KEY,
 )
-from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL
+from open_webui.env import SRC_LOG_LEVELS
+
+from firecrawl import Firecrawl
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -189,13 +189,12 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
                 (uses FIRE_CRAWL_API_KEY environment variable if not provided).
             api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
             mode: Operation mode selection:
-                - 'crawl': Website crawling mode (default)
-                - 'scrape': Direct page scraping
+                - 'crawl': Website crawling mode
+                - 'scrape': Direct page scraping (default)
                 - 'map': Site map generation
             proxy: Proxy override settings for the FireCrawl API.
             params: The parameters to pass to the Firecrawl API.
-                Examples include crawlerOptions.
-                For more details, visit: https://github.com/mendableai/firecrawl-py
+                For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
         """
         proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
@@ -215,50 +214,84 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         self.api_key = api_key
         self.api_url = api_url
         self.mode = mode
-        self.params = params
+        self.params = params or {}
 
     def lazy_load(self) -> Iterator[Document]:
-        """Load documents concurrently using FireCrawl."""
-        for url in self.web_paths:
-            try:
-                self._safe_process_url_sync(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
-                )
-                for document in loader.lazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
-                raise e
+        """Load documents using FireCrawl batch_scrape."""
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e
 
     async def alazy_load(self):
         """Async version of lazy_load."""
-        for url in self.web_paths:
-            try:
-                await self._safe_process_url(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
-                )
-                async for document in loader.alazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
-                raise e
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e
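Standalone, the batch path above reduces to the following — a sketch assuming firecrawl-py 4.x, where batch_scrape() waits for the job and returns an object exposing status and data, each data entry carrying markdown plus metadata, as the loader code relies on; URLs and key are placeholders:

# Minimal sketch of the batch-scrape flow (firecrawl-py 4.x assumed).
from firecrawl import Firecrawl

firecrawl = Firecrawl(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")
urls = ["https://example.com", "https://example.org"]  # placeholder URLs

result = firecrawl.batch_scrape(
    urls,
    formats=["markdown"],
    ignore_invalid_urls=True,    # skip malformed URLs instead of failing the batch
    wait_timeout=len(urls) * 3,  # same per-URL time budget the loader uses
)
if result.status != "completed":
    raise RuntimeError(f"batch scrape did not complete: {result}")

for data in result.data:
    # metadata can be missing on failed entries; getattr keeps the sketch safe
    source = getattr(data.metadata, "url", None) or getattr(data.metadata, "source_url", None)
    print(source, len(data.markdown or ""))

Note that one batch call per set of URLs also drops the per-URL rate-limiting hooks (_safe_process_url / _safe_process_url_sync) the old loop invoked.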

View file

@@ -133,7 +133,7 @@ pytest-docker~=3.1.1
 ldap3==2.9.1
 
 ## Firecrawl
-firecrawl-py==1.12.0
+firecrawl-py==4.5.0
 
 ## Trace
 opentelemetry-api==1.37.0

View file

@@ -151,7 +151,7 @@ all = [
     "oracledb==3.2.0",
     "colbert-ai==0.2.21",
-    "firecrawl-py==1.12.0",
+    "firecrawl-py==4.5.0",
 ]
 
 [project.scripts]
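The dependency bump from firecrawl-py 1.12.0 to 4.5.0 crosses three major versions, and the client entry point changes with it — sketched below for contrast; the v1 FirecrawlApp name comes from the older SDK's documentation, not from this diff:

# Before (firecrawl-py 1.x) -- older SDK entry point (assumption, for contrast):
# from firecrawl import FirecrawlApp
# app = FirecrawlApp(api_key="fc-YOUR-KEY")

# After (firecrawl-py 4.x) -- the entry point used throughout this commit:
from firecrawl import Firecrawl

firecrawl = Firecrawl(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")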