Mirror of https://github.com/open-webui/open-webui.git, synced 2025-12-11 20:05:19 +00:00
refactor: replace requests with the Firecrawl SDK in search, and use the Firecrawl SDK for scraping instead of langchain_community's FireCrawlLoader
parent d11d49a08a
commit 7a3f4d85f6

4 changed files with 87 additions and 63 deletions
@@ -1,11 +1,11 @@
 import logging
 from typing import Optional, List
-from urllib.parse import urljoin
-
-import requests
 from open_webui.retrieval.web.main import SearchResult, get_filtered_results
 from open_webui.env import SRC_LOG_LEVELS
 
+from firecrawl import Firecrawl
+
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])

@@ -18,27 +18,18 @@ def search_firecrawl(
     filter_list: Optional[List[str]] = None,
 ) -> List[SearchResult]:
     try:
-        firecrawl_search_url = urljoin(firecrawl_url, "/v1/search")
-        response = requests.post(
-            firecrawl_search_url,
-            headers={
-                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
-                "Authorization": f"Bearer {firecrawl_api_key}",
-            },
-            json={
-                "query": query,
-                "limit": count,
-            },
+        firecrawl = Firecrawl(api_key=firecrawl_api_key, api_url=firecrawl_url)
+        response = firecrawl.search(
+            query=query, limit=count, ignore_invalid_urls=True, timeout=count * 3
         )
-        response.raise_for_status()
-        results = response.json().get("data", [])
+        results = response.web
         if filter_list:
             results = get_filtered_results(results, filter_list)
         results = [
             SearchResult(
-                link=result.get("url"),
-                title=result.get("title"),
-                snippet=result.get("description"),
+                link=result.url,
+                title=result.title,
+                snippet=result.description,
             )
             for result in results[:count]
         ]
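For context, a minimal sketch of what the refactored search call looks like in isolation. The client, method, and result attributes are taken from the diff above; the key, URL, and query values are placeholders:

from firecrawl import Firecrawl

# Placeholder credentials; in Open WebUI these come from the configured
# firecrawl_api_key / firecrawl_url settings.
firecrawl = Firecrawl(api_key="fc-...", api_url="https://api.firecrawl.dev")

response = firecrawl.search(
    query="open webui",   # placeholder query
    limit=5,
    ignore_invalid_urls=True,
    timeout=5 * 3,        # the diff scales the timeout with the result count
)

# SDK results are attribute objects rather than dicts, which is why the
# diff switches from result.get("url") to result.url.
for result in response.web:
    print(result.url, result.title, result.description)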
@@ -4,7 +4,6 @@ import socket
 import ssl
 import urllib.parse
-import urllib.request
 from collections import defaultdict
 from datetime import datetime, time, timedelta
 from typing import (
     Any,

@@ -21,7 +20,6 @@ import aiohttp
 import certifi
 import validators
 from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
-from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader

@@ -39,7 +37,9 @@ from open_webui.config (
     EXTERNAL_WEB_LOADER_URL,
     EXTERNAL_WEB_LOADER_API_KEY,
 )
-from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL
+from open_webui.env import SRC_LOG_LEVELS
+
+from firecrawl import Firecrawl
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -189,13 +189,12 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
             (uses FIRE_CRAWL_API_KEY environment variable if not provided).
         api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
         mode: Operation mode selection:
-            - 'crawl': Website crawling mode (default)
-            - 'scrape': Direct page scraping
+            - 'crawl': Website crawling mode
+            - 'scrape': Direct page scraping (default)
             - 'map': Site map generation
         proxy: Proxy override settings for the FireCrawl API.
         params: The parameters to pass to the Firecrawl API.
-            Examples include crawlerOptions.
-            For more details, visit: https://github.com/mendableai/firecrawl-py
+            For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
         """
         proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
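Since 'scrape' becomes the default mode, a constructor call can now omit `mode` entirely. A hypothetical construction sketch; the keyword names are inferred from the attributes and docstring in the diff, not from a published signature:

# Keyword names (web_paths in particular) are assumptions drawn from the
# loader body below; values are placeholders.
loader = SafeFireCrawlLoader(
    web_paths=["https://example.com/docs"],  # placeholder URL list
    api_key="fc-...",                        # placeholder key
    api_url="https://api.firecrawl.dev",
    # mode defaults to "scrape" after this commit; "crawl" and "map" remain options
    params={},
)
docs = list(loader.lazy_load())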
@@ -215,50 +214,84 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         self.api_key = api_key
         self.api_url = api_url
         self.mode = mode
-        self.params = params
+        self.params = params or {}
 
     def lazy_load(self) -> Iterator[Document]:
-        """Load documents concurrently using FireCrawl."""
-        for url in self.web_paths:
-            try:
-                self._safe_process_url_sync(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
-                )
-                for document in loader.lazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
-                else:
-                    raise e
+        """Load documents using FireCrawl batch_scrape."""
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e
 
     async def alazy_load(self):
         """Async version of lazy_load."""
-        for url in self.web_paths:
-            try:
-                await self._safe_process_url(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
-                )
-                async for document in loader.alazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
-                else:
-                    raise e
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e
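The net effect is that one batched SDK call replaces the old per-URL FireCrawlLoader loop. A standalone sketch of the same flow under the diff's assumptions (firecrawl-py's `Firecrawl` client with `batch_scrape`; URLs and key are placeholders), with `getattr` guarding the metadata access in case `data.metadata` is None:

from firecrawl import Firecrawl
from langchain_core.documents import Document

# Placeholder inputs; in the loader these come from self.web_paths and settings.
urls = ["https://example.com", "https://example.org"]
firecrawl = Firecrawl(api_key="fc-...", api_url="https://api.firecrawl.dev")

# One batched call instead of one API round-trip per URL.
result = firecrawl.batch_scrape(
    urls,
    formats=["markdown"],
    ignore_invalid_urls=True,
    wait_timeout=len(urls) * 3,
)

if result.status != "completed":
    raise RuntimeError(f"batch scrape did not complete: {result.status}")

docs = []
for data in result.data:
    # getattr keeps this safe if data.metadata is None
    source = getattr(data.metadata, "url", None) or getattr(
        data.metadata, "source_url", None
    )
    docs.append(
        Document(page_content=data.markdown or "", metadata={"source": source or ""})
    )

One trade-off worth noting: the old loop could log and `continue` past a single failing URL, whereas the batched call succeeds or fails as a unit, so `continue_on_failure` now logs and skips the whole batch rather than one URL.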
@@ -133,7 +133,7 @@ pytest-docker~=3.1.1
 ldap3==2.9.1
 
 ## Firecrawl
-firecrawl-py==1.12.0
+firecrawl-py==4.5.0
 
 ## Trace
 opentelemetry-api==1.37.0
@@ -151,7 +151,7 @@ all = [
     "oracledb==3.2.0",
 
     "colbert-ai==0.2.21",
-    "firecrawl-py==1.12.0",
+    "firecrawl-py==4.5.0",
 ]
 
 [project.scripts]
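The pin jumps three major versions (1.12.0 to 4.5.0), which is what makes the `Firecrawl` client and `batch_scrape` used above available; the 1.x SDK exposed `FirecrawlApp` instead. A quick, illustrative way to sanity-check an environment after the bump:

# The module and class names are taken from the diff; the version check
# itself is just an illustration, not part of the commit.
import importlib.metadata

print(importlib.metadata.version("firecrawl-py"))  # expect "4.5.0" per the pin

from firecrawl import Firecrawl  # an ImportError here usually means the old 1.x SDK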