From 7a3f4d85f6a1a9567ae6f45dd8e4bedc5b9f0c13 Mon Sep 17 00:00:00 2001
From: wei840222
Date: Sun, 26 Oct 2025 15:00:37 +0800
Subject: [PATCH] refactor: replace requests with the Firecrawl SDK in search
 and replace langchain_community FireCrawlLoader with the Firecrawl SDK in
 scrape

---
 backend/open_webui/retrieval/web/firecrawl.py |  27 ++--
 backend/open_webui/retrieval/web/utils.py     | 119 +++++++++++-------
 backend/requirements.txt                      |   2 +-
 pyproject.toml                                |   2 +-
 4 files changed, 87 insertions(+), 63 deletions(-)

diff --git a/backend/open_webui/retrieval/web/firecrawl.py b/backend/open_webui/retrieval/web/firecrawl.py
index a85fc51fbd..acad014d70 100644
--- a/backend/open_webui/retrieval/web/firecrawl.py
+++ b/backend/open_webui/retrieval/web/firecrawl.py
@@ -1,11 +1,11 @@
 import logging
 from typing import Optional, List
-from urllib.parse import urljoin

-import requests
 from open_webui.retrieval.web.main import SearchResult, get_filtered_results
 from open_webui.env import SRC_LOG_LEVELS

+from firecrawl import Firecrawl
+
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])

@@ -18,27 +18,18 @@ def search_firecrawl(
     filter_list: Optional[List[str]] = None,
 ) -> List[SearchResult]:
     try:
-        firecrawl_search_url = urljoin(firecrawl_url, "/v1/search")
-        response = requests.post(
-            firecrawl_search_url,
-            headers={
-                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
-                "Authorization": f"Bearer {firecrawl_api_key}",
-            },
-            json={
-                "query": query,
-                "limit": count,
-            },
+        firecrawl = Firecrawl(api_key=firecrawl_api_key, api_url=firecrawl_url)
+        response = firecrawl.search(
+            query=query, limit=count, ignore_invalid_urls=True, timeout=count * 3
         )
-        response.raise_for_status()
-        results = response.json().get("data", [])
+        results = response.web
         if filter_list:
             results = get_filtered_results(results, filter_list)
         results = [
             SearchResult(
-                link=result.get("url"),
-                title=result.get("title"),
-                snippet=result.get("description"),
+                link=result.url,
+                title=result.title,
+                snippet=result.description,
             )
             for result in results[:count]
         ]
diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py
index 61356adb56..f5c89b4b58 100644
--- a/backend/open_webui/retrieval/web/utils.py
+++ b/backend/open_webui/retrieval/web/utils.py
@@ -4,7 +4,6 @@ import socket
 import ssl
 import urllib.parse
 import urllib.request
-from collections import defaultdict
 from datetime import datetime, time, timedelta
 from typing import (
     Any,
@@ -21,7 +20,6 @@ import aiohttp
 import certifi
 import validators
 from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
-from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader
@@ -39,7 +37,9 @@ from open_webui.config import (
     EXTERNAL_WEB_LOADER_URL,
     EXTERNAL_WEB_LOADER_API_KEY,
 )
-from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL
+from open_webui.env import SRC_LOG_LEVELS
+
+from firecrawl import Firecrawl

 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -189,13 +189,12 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
                 (uses FIRE_CRAWL_API_KEY environment variable if not provided).
             api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
             mode: Operation mode selection:
-                - 'crawl': Website crawling mode (default)
-                - 'scrape': Direct page scraping
+                - 'crawl': Website crawling mode
+                - 'scrape': Direct page scraping (default)
                 - 'map': Site map generation
             proxy: Proxy override settings for the FireCrawl API.
             params: The parameters to pass to the Firecrawl API.
-                Examples include crawlerOptions.
-                For more details, visit: https://github.com/mendableai/firecrawl-py
+                For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
         """
         proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
@@ -215,50 +214,84 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         self.api_key = api_key
         self.api_url = api_url
         self.mode = mode
-        self.params = params
+        self.params = params or {}

     def lazy_load(self) -> Iterator[Document]:
-        """Load documents concurrently using FireCrawl."""
-        for url in self.web_paths:
-            try:
-                self._safe_process_url_sync(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
+        """Load documents using FireCrawl batch_scrape."""
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
                 )
-                for document in loader.lazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
                 raise e

     async def alazy_load(self):
         """Async version of lazy_load."""
-        for url in self.web_paths:
-            try:
-                await self._safe_process_url(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
                 )
-                async for document in loader.alazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
                 raise e


diff --git a/backend/requirements.txt b/backend/requirements.txt
index 9e7ff206d5..0fdcb618fd 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -133,7 +133,7 @@ pytest-docker~=3.1.1
 ldap3==2.9.1

 ## Firecrawl
-firecrawl-py==1.12.0
+firecrawl-py==4.5.0

 ## Trace
 opentelemetry-api==1.37.0
diff --git a/pyproject.toml b/pyproject.toml
index 1f50f8783d..87e88a1b06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -151,7 +151,7 @@ all = [
     "oracledb==3.2.0",
     "colbert-ai==0.2.21",

-    "firecrawl-py==1.12.0",
+    "firecrawl-py==4.5.0",
 ]

 [project.scripts]
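Reviewer note (not part of the patch): a minimal sketch of how the two
refactored code paths would be exercised after this change. The API key and
URLs below are placeholders, and the SafeFireCrawlLoader constructor
arguments are assumptions inferred from the attributes the patched class
reads (web_paths, api_key, api_url, mode); search_firecrawl's signature and
the SearchResult fields come directly from the diff.

    from open_webui.retrieval.web.firecrawl import search_firecrawl
    from open_webui.retrieval.web.utils import SafeFireCrawlLoader

    # Search path: the old requests.post to /v1/search is now Firecrawl.search().
    results = search_firecrawl(
        firecrawl_url="https://api.firecrawl.dev",  # placeholder; a self-hosted URL also works
        firecrawl_api_key="fc-...",                 # placeholder API key
        query="open webui rag",
        count=5,
    )
    for r in results:
        print(r.link, r.title, r.snippet)

    # Scrape path: per-URL FireCrawlLoader calls are now a single batch_scrape()
    # over all web_paths, yielding langchain Documents with markdown content.
    loader = SafeFireCrawlLoader(
        web_paths=["https://docs.firecrawl.dev"],   # assumed constructor argument
        api_key="fc-...",                           # placeholder API key
        api_url="https://api.firecrawl.dev",
        mode="scrape",
    )
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))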