Mirror of https://github.com/open-webui/open-webui.git
refactor: replace requests with Firecrawl SDK in search, and use Firecrawl SDK in scrape rather than langchain_community FireCrawlLoader
This commit is contained in:
parent d11d49a08a
commit 7a3f4d85f6

4 changed files with 87 additions and 63 deletions
backend/open_webui/retrieval/web/firecrawl.py

@@ -1,11 +1,11 @@
 import logging
 from typing import Optional, List
-from urllib.parse import urljoin

-import requests
 from open_webui.retrieval.web.main import SearchResult, get_filtered_results
 from open_webui.env import SRC_LOG_LEVELS

+from firecrawl import Firecrawl
+
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])

@@ -18,27 +18,18 @@ def search_firecrawl(
     filter_list: Optional[List[str]] = None,
 ) -> List[SearchResult]:
     try:
-        firecrawl_search_url = urljoin(firecrawl_url, "/v1/search")
-        response = requests.post(
-            firecrawl_search_url,
-            headers={
-                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
-                "Authorization": f"Bearer {firecrawl_api_key}",
-            },
-            json={
-                "query": query,
-                "limit": count,
-            },
+        firecrawl = Firecrawl(api_key=firecrawl_api_key, api_url=firecrawl_url)
+        response = firecrawl.search(
+            query=query, limit=count, ignore_invalid_urls=True, timeout=count * 3
         )
-        response.raise_for_status()
-        results = response.json().get("data", [])
+        results = response.web
         if filter_list:
             results = get_filtered_results(results, filter_list)
         results = [
             SearchResult(
-                link=result.get("url"),
-                title=result.get("title"),
-                snippet=result.get("description"),
+                link=result.url,
+                title=result.title,
+                snippet=result.description,
             )
             for result in results[:count]
         ]
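For reference, a minimal sketch of the refactored search path, using only the calls visible in this hunk (Firecrawl(...), .search(...), and the typed .web result list); the API key and URL values are placeholders:

# Minimal sketch of the new SDK-based search; assumes firecrawl-py 4.x.
from firecrawl import Firecrawl

firecrawl = Firecrawl(api_key="fc-...", api_url="https://api.firecrawl.dev")
response = firecrawl.search(
    query="open webui",
    limit=3,
    ignore_invalid_urls=True,
    timeout=9,  # the hunk scales this as count * 3
)
for result in response.web:
    # The SDK returns typed result objects, hence attribute access instead of
    # the old response.json().get("data", []) dict handling.
    print(result.url, result.title, result.description)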
backend/open_webui/retrieval/web/utils.py

@@ -4,7 +4,6 @@ import socket
 import ssl
 import urllib.parse
 import urllib.request
-from collections import defaultdict
 from datetime import datetime, time, timedelta
 from typing import (
     Any,

@@ -21,7 +20,6 @@ import aiohttp
 import certifi
 import validators
 from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
-from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader

@@ -39,7 +37,9 @@ from open_webui.config import (
     EXTERNAL_WEB_LOADER_URL,
     EXTERNAL_WEB_LOADER_API_KEY,
 )
-from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL
+from open_webui.env import SRC_LOG_LEVELS
+
+from firecrawl import Firecrawl

 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])

@@ -189,13 +189,12 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         (uses FIRE_CRAWL_API_KEY environment variable if not provided).
         api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
         mode: Operation mode selection:
-            - 'crawl': Website crawling mode (default)
-            - 'scrape': Direct page scraping
+            - 'crawl': Website crawling mode
+            - 'scrape': Direct page scraping (default)
             - 'map': Site map generation
         proxy: Proxy override settings for the FireCrawl API.
         params: The parameters to pass to the Firecrawl API.
-            Examples include crawlerOptions.
-            For more details, visit: https://github.com/mendableai/firecrawl-py
+            For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
         """
         proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
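As an aside before the next hunk: a hypothetical construction of the loader under the updated docstring. web_paths, api_key, api_url, mode, and params all appear in this diff; the concrete values are placeholders, and any other constructor keywords are not shown here.

# Hypothetical usage sketch of SafeFireCrawlLoader after this change.
loader = SafeFireCrawlLoader(
    web_paths=["https://example.com/docs"],  # scraped in one batch below
    api_key="fc-...",                        # placeholder key
    api_url="https://api.firecrawl.dev",
    mode="scrape",                           # now documented as the default
    params={},                               # falls back to {} per __init__
)
documents = list(loader.lazy_load())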
@@ -215,50 +214,84 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         self.api_key = api_key
         self.api_url = api_url
         self.mode = mode
-        self.params = params
+        self.params = params or {}

     def lazy_load(self) -> Iterator[Document]:
-        """Load documents concurrently using FireCrawl."""
-        for url in self.web_paths:
-            try:
-                self._safe_process_url_sync(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
-                )
-                for document in loader.lazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
-                raise e
+        """Load documents using FireCrawl batch_scrape."""
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e

     async def alazy_load(self):
         """Async version of lazy_load."""
-        for url in self.web_paths:
-            try:
-                await self._safe_process_url(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
-                )
-                async for document in loader.alazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
-                raise e
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
+                )
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
+                raise e
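The heart of the change: both methods now make a single batch_scrape call for all web_paths instead of instantiating a FireCrawlLoader per URL, and the per-URL _safe_process_url_sync / _safe_process_url checks from the old loop are dropped, apparently delegating URL filtering to ignore_invalid_urls=True. A standalone sketch of that flow, assuming firecrawl-py 4.x and using only parameters and attributes that appear in the hunk (urls and the API key are placeholders):

# Standalone sketch of the batch flow shared by lazy_load and alazy_load.
from firecrawl import Firecrawl
from langchain_core.documents import Document

urls = ["https://example.com/a", "https://example.com/b"]
firecrawl = Firecrawl(api_key="fc-...", api_url="https://api.firecrawl.dev")
result = firecrawl.batch_scrape(
    urls,
    formats=["markdown"],
    ignore_invalid_urls=True,    # skip bad URLs instead of failing the batch
    remove_base64_images=True,
    max_age=300000,              # accept cached pages up to 5 minutes old
    wait_timeout=len(urls) * 3,  # scaled per URL, as in the hunk
)
if result.status != "completed":
    raise RuntimeError(f"batch scrape did not complete: {result}")

documents = []
for data in result.data:
    metadata = data.metadata
    source = (metadata.url or metadata.source_url) if metadata else ""
    documents.append(
        Document(page_content=data.markdown or "", metadata={"source": source})
    )

Note that the new alazy_load body is identical to lazy_load: batch_scrape still blocks inside the async generator. Callers needing truly non-blocking behavior would have to offload the call (for example with asyncio.to_thread), which this commit does not do.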
backend/requirements.txt

@@ -133,7 +133,7 @@ pytest-docker~=3.1.1
 ldap3==2.9.1

 ## Firecrawl
-firecrawl-py==1.12.0
+firecrawl-py==4.5.0

 ## Trace
 opentelemetry-api==1.37.0
pyproject.toml

@@ -151,7 +151,7 @@ all = [
     "oracledb==3.2.0",

     "colbert-ai==0.2.21",
-    "firecrawl-py==1.12.0",
+    "firecrawl-py==4.5.0",
 ]

 [project.scripts]