Mirror of https://github.com/open-webui/open-webui.git, synced 2025-12-11 20:05:19 +00:00
* feat: improve ollama model management experience
This commit introduces several improvements to the Ollama model management modal:
- Adds a cancel button to the model pulling operation, using the existing 'x' button pattern.
- Adds a cancel button to the "Update All" models operation, allowing the user to cancel the update for the currently processing model.
- Cleans up toast notifications when updating all models. A single toast is now shown at the beginning and a summary toast at the end, preventing notification spam.
- Refactors the `ManageOllama.svelte` component to support these new cancellation features.
- Adds tooltips to all buttons in the modal to improve clarity.
- Disables buttons when their corresponding input fields are empty to prevent accidental clicks.
* fix
* i18n: improve Chinese translation
* fix: handle non‑UTF8 chars in third‑party responses without error
* German translation of new strings in i18n
* log web search queries only with level 'debug' instead of 'info'
* Tool calls now only include text and don't include other content like image b64
* fix onedrive
* fix: discovery url
* fix: default permissions not being loaded
* fix: ai hallucination
* fix: non rich text input copy
* refac: rm print statements
* refac: disable direct models from model editors
* refac/fix: do not process xlsx files with azure doc intelligence
* Update pull_request_template.md
* Update generated image translation in DE-de
* added missing Danish translations
* feat(onedrive): Enable search and "My Organization" pivot
* style(onedrive): Formatting fix
* feat: Implement toggling for vertical and horizontal flow layouts
This commit introduces the necessary logic and UI controls to allow users to switch the Flow component layout between vertical and horizontal orientations.
* **`Flow.svelte` Refactoring:**
* Updates logic for calculating level offsets and node positions to consistently respect the current flow orientation.
* Adds a control panel using `<Controls>` and `<SwitchButton>` components.
* Provides user interface elements to easily switch the flow layout between horizontal and vertical orientations.
* build(deps): bump pydantic from 2.11.7 to 2.11.9 in /backend
Bumps [pydantic](https://github.com/pydantic/pydantic) from 2.11.7 to 2.11.9.
- [Release notes](https://github.com/pydantic/pydantic/releases)
- [Changelog](https://github.com/pydantic/pydantic/blob/v2.11.9/HISTORY.md)
- [Commits](https://github.com/pydantic/pydantic/compare/v2.11.7...v2.11.9)
---
updated-dependencies:
- dependency-name: pydantic
dependency-version: 2.11.9
dependency-type: direct:production
update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot] <support@github.com>
* build(deps): bump black from 25.1.0 to 25.9.0 in /backend
Bumps [black](https://github.com/psf/black) from 25.1.0 to 25.9.0.
- [Release notes](https://github.com/psf/black/releases)
- [Changelog](https://github.com/psf/black/blob/main/CHANGES.md)
- [Commits](https://github.com/psf/black/compare/25.1.0...25.9.0)
---
updated-dependencies:
- dependency-name: black
dependency-version: 25.9.0
dependency-type: direct:production
update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot] <support@github.com>
* build(deps): bump markdown from 3.8.2 to 3.9 in /backend
Bumps [markdown](https://github.com/Python-Markdown/markdown) from 3.8.2 to 3.9.
- [Release notes](https://github.com/Python-Markdown/markdown/releases)
- [Changelog](https://github.com/Python-Markdown/markdown/blob/master/docs/changelog.md)
- [Commits](https://github.com/Python-Markdown/markdown/compare/3.8.2...3.9.0)
---
updated-dependencies:
- dependency-name: markdown
dependency-version: '3.9'
dependency-type: direct:production
update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot] <support@github.com>
* build(deps): bump chromadb from 1.0.20 to 1.1.0 in /backend
Bumps [chromadb](https://github.com/chroma-core/chroma) from 1.0.20 to 1.1.0.
- [Release notes](https://github.com/chroma-core/chroma/releases)
- [Changelog](https://github.com/chroma-core/chroma/blob/main/RELEASE_PROCESS.md)
- [Commits](https://github.com/chroma-core/chroma/compare/1.0.20...1.1.0)
---
updated-dependencies:
- dependency-name: chromadb
dependency-version: 1.1.0
dependency-type: direct:production
update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot] <support@github.com>
* build(deps): bump opentelemetry-api from 1.36.0 to 1.37.0
Bumps [opentelemetry-api](https://github.com/open-telemetry/opentelemetry-python) from 1.36.0 to 1.37.0.
- [Release notes](https://github.com/open-telemetry/opentelemetry-python/releases)
- [Changelog](https://github.com/open-telemetry/opentelemetry-python/blob/main/CHANGELOG.md)
- [Commits](https://github.com/open-telemetry/opentelemetry-python/compare/v1.36.0...v1.37.0)
---
updated-dependencies:
- dependency-name: opentelemetry-api
dependency-version: 1.37.0
dependency-type: direct:production
update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot] <support@github.com>
* refac: ollama embed form data
* fix: non rich text handling
* fix: oauth client registration
* refac
* chore: dep bump
* chore: fastapi bump
* chore/refac: bump bcrypt and remove passlib
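A minimal sketch of what replacing passlib with direct bcrypt calls can look like (illustrative helper names, not necessarily Open WebUI's exact functions):
```python
import bcrypt

def hash_password(password: str) -> str:
    # bcrypt works on bytes; gensalt() embeds the cost factor and salt in the hash
    return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8")

def verify_password(password: str, hashed: str) -> bool:
    # checkpw re-derives the hash from the stored salt and compares it safely
    return bcrypt.checkpw(password.encode("utf-8"), hashed.encode("utf-8"))
```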
* Improving Korean Translation
* refac
* Improving Korean Translation
* feat: PWA share_target implementation
Co-Authored-By: gjveld <19951982+gjveld@users.noreply.github.com>
* refac: message input mobile detection behaviour
* feat: model_ids per folder
* Update translation.json (pt-BR)
Adds translations for newly added strings
* refac
* refac
* refac
* refac
* refac/fix: temp chat
* refac
* refac: stop task
* refac/fix: azure audio escape
* refac: external tool validation
* refac/enh: start.sh additional args support
* refac
* refac: styling
* refac/fix: direct connection floating action buttons
* refac/fix: system prompt duplication
* refac/enh: openai tts additional params support
* refac
* feat: load data in parallel to accelerate page loading speed
* i18n: improve Chinese translation
* refac
* refac: model selector
* UPD: i18n es-ES Translation v0.6.33
Updated new strings.
* refac
* improved query performance by querying only relevant columns
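A hedged SQLAlchemy-style sketch of the idea; the model and column names are illustrative, not Open WebUI's actual schema:
```python
from sqlalchemy import create_engine, select, String, Integer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session

class Base(DeclarativeBase):
    pass

# Illustrative model only; not the real Open WebUI schema.
class Chat(Base):
    __tablename__ = "chat"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    user_id: Mapped[str] = mapped_column(String)
    title: Mapped[str] = mapped_column(String)
    content: Mapped[str] = mapped_column(String)  # large column we want to avoid loading

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    # Before: select(Chat) materializes full rows, including the large 'content' column.
    # After: fetch only the columns the list view actually needs.
    stmt = select(Chat.id, Chat.title).where(Chat.user_id == "u1")
    rows = session.execute(stmt).all()
```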
* refac/enh: docling params
* refac
* refac: openai additional headers support
* refac
* FEAT: Add Vega Chart Visualizer Renderer
### FEAT: Add Vega Chart Visualizer Renderer
Feature requested in https://github.com/open-webui/open-webui/discussions/18022
Added the npm vega lib to package.json
Added a visualization renderer function to src/libs/utils/index.ts
Added logic to src/lib/components/chat/Messages/CodeBlock.svelte
The treatment is similar to that for Mermaid diagrams.
Reference: https://vega.github.io/vega/
* refac
* chore
* refac
* FEAT: Add Vega-Lite Chart Visualizer Renderer
### FEAT: Add Vega-Lite Chart Visualizer Renderer
Adds support for Vega-Lite specifications.
Vega-Lite is a higher-level grammar that compiles down to Vega; to be rendered with Vega, a Vega-Lite spec must first be compiled.
This PR adds that check and compiles the spec when necessary, complementing the recently added Vega renderer feature.
* refac
* refac/fix: switch
* enh/refac: url input handling
* refac
* refac: styling
* UPD: Add Validators & Error Toast for Mermaid & Vega diagrams
### UPD: Feat: Add Validators & Error Toast for Mermaid & Vega diagrams
Description:
Generated or hand-entered diagrams often contain syntax errors, so they fail to render, and without any notification it is difficult to know what happened.
This PR adds validation and a toast notification for errors in Mermaid and Vega/Vega-Lite diagrams, helping the user fix them.
* removed redundant knowledge API call
* Fix Code Format
* refac: model workspace view
* refac
* refac: knowledge
* refac: prompts
* refac: tools
* refac
* feat: attach folder
* refac: make tencentcloud-sdk-python optional
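A minimal sketch of the optional-dependency pattern this implies, with the import deferred and guarded (the helper name and error message are illustrative):
```python
def _load_tencentcloud():
    """Import the Tencent Cloud SDK lazily so it stays an optional dependency."""
    try:
        # Only required when a Tencent Cloud backed feature is actually enabled.
        import tencentcloud  # provided by the optional tencentcloud-sdk-python package
        return tencentcloud
    except ImportError as e:
        raise RuntimeError(
            "This feature requires the optional 'tencentcloud-sdk-python' package. "
            "Install it to enable Tencent Cloud support."
        ) from e
```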
* refac/fix: oauth
* enh: ENABLE_OAUTH_EMAIL_FALLBACK
* refac/fix: folders
* Update requirements.txt
* Update pyproject.toml
* UPD: Add Validators & Error Toast for Mermaid & Vega diagrams
### UPD: Feat: Add Validators & Error Toast for Mermaid & Vega diagrams
Description:
Generated or hand-entered diagrams often contain syntax errors, so they fail to render, and without any notification it is difficult to know what happened.
This PR adds validation and a toast notification for errors in Mermaid and Vega/Vega-Lite diagrams, helping the user fix them.
Note:
Another possibility for integrating this graph visualizer is through its Svelte component: https://github.com/vega/svelte-vega/tree/main/packages/svelte-vega
* Removed unused toast import & Code Format
* refac
* refac: external tool server view
* refac
* refac: overview
* refac: styling
* refac
* Update bug_report.yaml
* refac
* refac
* refac
* refac
* refac: oauth client fallback
* Fixed: Cannot handle batch sizes > 1 if no padding token is defined
Fixes the "Cannot handle batch sizes > 1 if no padding token is defined" error for reranker models that do not define a padding token in their config, by using the eos_token_id (when present) as the pad_token_id.
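A minimal sketch of this kind of fallback using Hugging Face transformers (illustrative only; the checkpoint name is a placeholder and this is not the exact code from the commit):
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Placeholder checkpoint; substitute the reranker actually configured.
model_name = "some-org/some-reranker"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Some model configs ship without a padding token, which makes batched inference
# fail with "Cannot handle batch sizes > 1 if no padding token is defined".
# Fall back to the EOS token when one is available.
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
```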
* refac: fallback to reasoning content
* fix(i18n): corrected typo in Spanish translation for "Reasoning Tags"
Typo fixed in Spanish translation file at line 1240 of `open-webui/src/lib/i18n/locales/es-ES/translation.json`:
- Incorrect: "Eriquetas de Razonamiento"
- Correct: "Etiquetas de Razonamiento"
This improves clarity and consistency in the UI.
* refac/fix: ENABLE_STAR_SESSIONS_MIDDLEWARE
* refac/fix: redirect
* refac
* refac
* refac
* refac: web search error handling
* refac: source parsing
* refac: functions
* refac
* refac/enh: note pdf export
* refac/fix: mcp oauth2.1
* chore: format
* chore: Changelog (#17995)
* Update CHANGELOG.md
* refac
* chore: dep bump
---------
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: silentoplayz <jacwoo21@outlook.com>
Co-authored-by: Shirasawa <764798966@qq.com>
Co-authored-by: Jan Kessler <jakessle@uni-mainz.de>
Co-authored-by: Jacob Leksan <jacob.leksan@expedient.com>
Co-authored-by: Classic298 <27028174+Classic298@users.noreply.github.com>
Co-authored-by: sinejespersen <sinejespersen@protonmail.com>
Co-authored-by: Selene Blok <selene.blok@rws.nl>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Cyp <cypher9715@naver.com>
Co-authored-by: gjveld <19951982+gjveld@users.noreply.github.com>
Co-authored-by: joaoback <156559121+joaoback@users.noreply.github.com>
Co-authored-by: _00_ <131402327+rgaricano@users.noreply.github.com>
Co-authored-by: expruc <eygabi01@gmail.com>
Co-authored-by: YetheSamartaka <55753928+YetheSamartaka@users.noreply.github.com>
Co-authored-by: Akutangulo <akutangulo@gmail.com>
652 lines · 26 KiB · Python
import asyncio
import logging
import socket
import ssl
import time  # the time module (not datetime.time), needed for time.sleep() in RateLimitMixin
import urllib.parse
import urllib.request
from collections import defaultdict
from datetime import datetime, timedelta
from typing import (
    Any,
    AsyncIterator,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Union,
    Literal,
)

import aiohttp
import certifi
import validators
from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from open_webui.retrieval.loaders.tavily import TavilyLoader
from open_webui.retrieval.loaders.external_web import ExternalWebLoader

from open_webui.constants import ERROR_MESSAGES
from open_webui.config import (
    ENABLE_RAG_LOCAL_WEB_FETCH,
    PLAYWRIGHT_WS_URL,
    PLAYWRIGHT_TIMEOUT,
    WEB_LOADER_ENGINE,
    FIRECRAWL_API_BASE_URL,
    FIRECRAWL_API_KEY,
    TAVILY_API_KEY,
    TAVILY_EXTRACT_DEPTH,
    EXTERNAL_WEB_LOADER_URL,
    EXTERNAL_WEB_LOADER_API_KEY,
)
from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL

log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])

def validate_url(url: Union[str, Sequence[str]]):
    if isinstance(url, str):
        if isinstance(validators.url(url), validators.ValidationError):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if not ENABLE_RAG_LOCAL_WEB_FETCH:
            # Local web fetch is disabled, filter out any URLs that resolve to private IP addresses
            parsed_url = urllib.parse.urlparse(url)
            # Get IPv4 and IPv6 addresses
            ipv4_addresses, ipv6_addresses = resolve_hostname(parsed_url.hostname)
            # Check if any of the resolved addresses are private
            # This is technically still vulnerable to DNS rebinding attacks, as we don't control WebBaseLoader
            for ip in ipv4_addresses:
                if validators.ipv4(ip, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
            for ip in ipv6_addresses:
                if validators.ipv6(ip, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
        return True
    elif isinstance(url, Sequence):
        return all(validate_url(u) for u in url)
    else:
        return False


def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
    valid_urls = []
    for u in url:
        try:
            if validate_url(u):
                valid_urls.append(u)
        except Exception as e:
            log.debug(f"Invalid URL {u}: {str(e)}")
            continue
    return valid_urls


def resolve_hostname(hostname):
    # Get address information
    addr_info = socket.getaddrinfo(hostname, None)

    # Extract IP addresses from address information
    ipv4_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET]
    ipv6_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET6]

    return ipv4_addresses, ipv6_addresses


def extract_metadata(soup, url):
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        metadata["description"] = description.get("content", "No description found.")
    if html := soup.find("html"):
        metadata["language"] = html.get("lang", "No language found.")
    return metadata


def verify_ssl_cert(url: str) -> bool:
    """Verify SSL certificate for the given URL."""
    if not url.startswith("https://"):
        return True

    try:
        hostname = url.split("://")[-1].split("/")[0]
        context = ssl.create_default_context(cafile=certifi.where())
        with context.wrap_socket(ssl.socket(), server_hostname=hostname) as s:
            s.connect((hostname, 443))
        return True
    except ssl.SSLError:
        return False
    except Exception as e:
        log.warning(f"SSL verification failed for {url}: {str(e)}")
        return False

class RateLimitMixin:
    async def _wait_for_rate_limit(self):
        """Wait to respect the rate limit if specified."""
        if self.requests_per_second and self.last_request_time:
            min_interval = timedelta(seconds=1.0 / self.requests_per_second)
            time_since_last = datetime.now() - self.last_request_time
            if time_since_last < min_interval:
                await asyncio.sleep((min_interval - time_since_last).total_seconds())
        self.last_request_time = datetime.now()

    def _sync_wait_for_rate_limit(self):
        """Synchronous version of rate limit wait."""
        if self.requests_per_second and self.last_request_time:
            min_interval = timedelta(seconds=1.0 / self.requests_per_second)
            time_since_last = datetime.now() - self.last_request_time
            if time_since_last < min_interval:
                time.sleep((min_interval - time_since_last).total_seconds())
        self.last_request_time = datetime.now()


class URLProcessingMixin:
    def _verify_ssl_cert(self, url: str) -> bool:
        """Verify SSL certificate for a URL."""
        return verify_ssl_cert(url)

    async def _safe_process_url(self, url: str) -> bool:
        """Perform safety checks before processing a URL."""
        if self.verify_ssl and not self._verify_ssl_cert(url):
            raise ValueError(f"SSL certificate verification failed for {url}")
        await self._wait_for_rate_limit()
        return True

    def _safe_process_url_sync(self, url: str) -> bool:
        """Synchronous version of safety checks."""
        if self.verify_ssl and not self._verify_ssl_cert(url):
            raise ValueError(f"SSL certificate verification failed for {url}")
        self._sync_wait_for_rate_limit()
        return True

class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
    def __init__(
        self,
        web_paths,
        verify_ssl: bool = True,
        trust_env: bool = False,
        requests_per_second: Optional[float] = None,
        continue_on_failure: bool = True,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map"] = "scrape",
        proxy: Optional[Dict[str, str]] = None,
        params: Optional[Dict] = None,
    ):
        """Concurrent document loader for FireCrawl operations.

        Executes multiple FireCrawlLoader instances concurrently using thread pooling
        to improve bulk processing efficiency.

        Args:
            web_paths: List of URLs/paths to process.
            verify_ssl: If True, verify SSL certificates.
            trust_env: If True, use proxy settings from environment variables.
            requests_per_second: Number of requests per second to limit to.
            continue_on_failure (bool): If True, continue loading other URLs on failure.
            api_key: API key for FireCrawl service. Defaults to None
                (uses FIRE_CRAWL_API_KEY environment variable if not provided).
            api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
            mode: Operation mode selection:
                - 'crawl': Website crawling mode
                - 'scrape': Direct page scraping (default)
                - 'map': Site map generation
            proxy: Proxy override settings for the FireCrawl API.
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        """
        proxy_server = proxy.get("server") if proxy else None
        if trust_env and not proxy_server:
            env_proxies = urllib.request.getproxies()
            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
            if env_proxy_server:
                if proxy:
                    proxy["server"] = env_proxy_server
                else:
                    proxy = {"server": env_proxy_server}
        self.web_paths = web_paths
        self.verify_ssl = verify_ssl
        self.requests_per_second = requests_per_second
        self.last_request_time = None
        self.trust_env = trust_env
        self.continue_on_failure = continue_on_failure
        self.api_key = api_key
        self.api_url = api_url
        self.mode = mode
        self.params = params

    def lazy_load(self) -> Iterator[Document]:
        """Load documents concurrently using FireCrawl."""
        for url in self.web_paths:
            try:
                self._safe_process_url_sync(url)
                loader = FireCrawlLoader(
                    url=url,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    mode=self.mode,
                    params=self.params,
                )
                for document in loader.lazy_load():
                    if not document.metadata.get("source"):
                        document.metadata["source"] = document.metadata.get("sourceURL")
                    yield document
            except Exception as e:
                if self.continue_on_failure:
                    log.exception(f"Error loading {url}: {e}")
                    continue
                raise e

    async def alazy_load(self):
        """Async version of lazy_load."""
        for url in self.web_paths:
            try:
                await self._safe_process_url(url)
                loader = FireCrawlLoader(
                    url=url,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    mode=self.mode,
                    params=self.params,
                )
                async for document in loader.alazy_load():
                    if not document.metadata.get("source"):
                        document.metadata["source"] = document.metadata.get("sourceURL")
                    yield document
            except Exception as e:
                if self.continue_on_failure:
                    log.exception(f"Error loading {url}: {e}")
                    continue
                raise e

class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
    def __init__(
        self,
        web_paths: Union[str, List[str]],
        api_key: str,
        extract_depth: Literal["basic", "advanced"] = "basic",
        continue_on_failure: bool = True,
        requests_per_second: Optional[float] = None,
        verify_ssl: bool = True,
        trust_env: bool = False,
        proxy: Optional[Dict[str, str]] = None,
    ):
        """Initialize SafeTavilyLoader with rate limiting and SSL verification support.

        Args:
            web_paths: List of URLs/paths to process.
            api_key: The Tavily API key.
            extract_depth: Depth of extraction ("basic" or "advanced").
            continue_on_failure: Whether to continue if extraction of a URL fails.
            requests_per_second: Number of requests per second to limit to.
            verify_ssl: If True, verify SSL certificates.
            trust_env: If True, use proxy settings from environment variables.
            proxy: Optional proxy configuration.
        """
        # Initialize proxy configuration if using environment variables
        proxy_server = proxy.get("server") if proxy else None
        if trust_env and not proxy_server:
            env_proxies = urllib.request.getproxies()
            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
            if env_proxy_server:
                if proxy:
                    proxy["server"] = env_proxy_server
                else:
                    proxy = {"server": env_proxy_server}

        # Store parameters for creating TavilyLoader instances
        self.web_paths = web_paths if isinstance(web_paths, list) else [web_paths]
        self.api_key = api_key
        self.extract_depth = extract_depth
        self.continue_on_failure = continue_on_failure
        self.verify_ssl = verify_ssl
        self.trust_env = trust_env
        self.proxy = proxy

        # Add rate limiting
        self.requests_per_second = requests_per_second
        self.last_request_time = None

    def lazy_load(self) -> Iterator[Document]:
        """Load documents with rate limiting support, delegating to TavilyLoader."""
        valid_urls = []
        for url in self.web_paths:
            try:
                self._safe_process_url_sync(url)
                valid_urls.append(url)
            except Exception as e:
                log.warning(f"SSL verification failed for {url}: {str(e)}")
                if not self.continue_on_failure:
                    raise e
        if not valid_urls:
            if self.continue_on_failure:
                log.warning("No valid URLs to process after SSL verification")
                return
            raise ValueError("No valid URLs to process after SSL verification")
        try:
            loader = TavilyLoader(
                urls=valid_urls,
                api_key=self.api_key,
                extract_depth=self.extract_depth,
                continue_on_failure=self.continue_on_failure,
            )
            yield from loader.lazy_load()
        except Exception as e:
            if self.continue_on_failure:
                log.exception(f"Error extracting content from URLs: {e}")
            else:
                raise e

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async version with rate limiting and SSL verification."""
        valid_urls = []
        for url in self.web_paths:
            try:
                await self._safe_process_url(url)
                valid_urls.append(url)
            except Exception as e:
                log.warning(f"SSL verification failed for {url}: {str(e)}")
                if not self.continue_on_failure:
                    raise e

        if not valid_urls:
            if self.continue_on_failure:
                log.warning("No valid URLs to process after SSL verification")
                return
            raise ValueError("No valid URLs to process after SSL verification")

        try:
            loader = TavilyLoader(
                urls=valid_urls,
                api_key=self.api_key,
                extract_depth=self.extract_depth,
                continue_on_failure=self.continue_on_failure,
            )
            async for document in loader.alazy_load():
                yield document
        except Exception as e:
            if self.continue_on_failure:
                log.exception(f"Error loading URLs: {e}")
            else:
                raise e

class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessingMixin):
    """Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection.

    Attributes:
        web_paths (List[str]): List of URLs to load.
        verify_ssl (bool): If True, verify SSL certificates.
        trust_env (bool): If True, use proxy settings from environment variables.
        requests_per_second (Optional[float]): Number of requests per second to limit to.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (dict): Proxy override settings for the Playwright session.
        playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection.
        playwright_timeout (Optional[int]): Maximum operation time in milliseconds.
    """

    def __init__(
        self,
        web_paths: List[str],
        verify_ssl: bool = True,
        trust_env: bool = False,
        requests_per_second: Optional[float] = None,
        continue_on_failure: bool = True,
        headless: bool = True,
        remove_selectors: Optional[List[str]] = None,
        proxy: Optional[Dict[str, str]] = None,
        playwright_ws_url: Optional[str] = None,
        playwright_timeout: Optional[int] = 10000,
    ):
        """Initialize with additional safety parameters and remote browser support."""

        proxy_server = proxy.get("server") if proxy else None
        if trust_env and not proxy_server:
            env_proxies = urllib.request.getproxies()
            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
            if env_proxy_server:
                if proxy:
                    proxy["server"] = env_proxy_server
                else:
                    proxy = {"server": env_proxy_server}

        # We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
        super().__init__(
            urls=web_paths,
            continue_on_failure=continue_on_failure,
            headless=headless if playwright_ws_url is None else False,
            remove_selectors=remove_selectors,
            proxy=proxy,
        )
        self.verify_ssl = verify_ssl
        self.requests_per_second = requests_per_second
        self.last_request_time = None
        self.playwright_ws_url = playwright_ws_url
        self.trust_env = trust_env
        self.playwright_timeout = playwright_timeout

    def lazy_load(self) -> Iterator[Document]:
        """Safely load URLs synchronously with support for remote browser."""
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Use remote browser if ws_endpoint is provided, otherwise use local browser
            if self.playwright_ws_url:
                browser = p.chromium.connect(self.playwright_ws_url)
            else:
                browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)

            for url in self.urls:
                try:
                    self._safe_process_url_sync(url)
                    page = browser.new_page()
                    response = page.goto(url, timeout=self.playwright_timeout)
                    if response is None:
                        raise ValueError(f"page.goto() returned None for url {url}")

                    text = self.evaluator.evaluate(page, browser, response)
                    metadata = {"source": url}
                    yield Document(page_content=text, metadata=metadata)
                except Exception as e:
                    if self.continue_on_failure:
                        log.exception(f"Error loading {url}: {e}")
                        continue
                    raise e
            browser.close()

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Safely load URLs asynchronously with support for remote browser."""
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            # Use remote browser if ws_endpoint is provided, otherwise use local browser
            if self.playwright_ws_url:
                browser = await p.chromium.connect(self.playwright_ws_url)
            else:
                browser = await p.chromium.launch(
                    headless=self.headless, proxy=self.proxy
                )

            for url in self.urls:
                try:
                    await self._safe_process_url(url)
                    page = await browser.new_page()
                    response = await page.goto(url, timeout=self.playwright_timeout)
                    if response is None:
                        raise ValueError(f"page.goto() returned None for url {url}")

                    text = await self.evaluator.evaluate_async(page, browser, response)
                    metadata = {"source": url}
                    yield Document(page_content=text, metadata=metadata)
                except Exception as e:
                    if self.continue_on_failure:
                        log.exception(f"Error loading {url}: {e}")
                        continue
                    raise e
            await browser.close()

class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader with enhanced error handling for URLs."""

    def __init__(self, trust_env: bool = False, *args, **kwargs):
        """Initialize SafeWebBaseLoader

        Args:
            trust_env (bool, optional): set to True if using proxy to make web requests, for example
                using http(s)_proxy environment variables. Defaults to False.
        """
        super().__init__(*args, **kwargs)
        self.trust_env = trust_env

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
            for i in range(retries):
                try:
                    kwargs: Dict = dict(
                        headers=self.session.headers,
                        cookies=self.session.cookies.get_dict(),
                    )
                    if not self.session.verify:
                        kwargs["ssl"] = False

                    async with session.get(
                        url,
                        **(self.requests_kwargs | kwargs),
                        allow_redirects=False,
                    ) as response:
                        if self.raise_for_status:
                            response.raise_for_status()
                        return await response.text()
                except aiohttp.ClientConnectionError as e:
                    if i == retries - 1:
                        raise
                    else:
                        log.warning(
                            f"Error fetching {url} with attempt "
                            f"{i + 1}/{retries}: {e}. Retrying..."
                        )
                        await asyncio.sleep(cooldown * backoff**i)
        raise ValueError("retry count exceeded")

    def _unpack_fetch_results(
        self, results: Any, urls: List[str], parser: Union[str, None] = None
    ) -> List[Any]:
        """Unpack fetch results into BeautifulSoup objects."""
        from bs4 import BeautifulSoup

        final_results = []
        for i, result in enumerate(results):
            url = urls[i]
            if parser is None:
                if url.endswith(".xml"):
                    parser = "xml"
                else:
                    parser = self.default_parser
                self._check_parser(parser)
            final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
        return final_results

    async def ascrape_all(
        self, urls: List[str], parser: Union[str, None] = None
    ) -> List[Any]:
        """Async fetch all urls, then return soups for all results."""
        results = await self.fetch_all(urls)
        return self._unpack_fetch_results(results, urls, parser=parser)

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path with error handling."""
        for path in self.web_paths:
            try:
                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
                text = soup.get_text(**self.bs_get_text_kwargs)

                # Build metadata
                metadata = extract_metadata(soup, path)

                yield Document(page_content=text, metadata=metadata)
            except Exception as e:
                # Log the error and continue with the next URL
                log.exception(f"Error loading {path}: {e}")

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async lazy load text from the url(s) in web_path."""
        results = await self.ascrape_all(self.web_paths)
        for path, soup in zip(self.web_paths, results):
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = {"source": path}
            if title := soup.find("title"):
                metadata["title"] = title.get_text()
            if description := soup.find("meta", attrs={"name": "description"}):
                metadata["description"] = description.get(
                    "content", "No description found."
                )
            if html := soup.find("html"):
                metadata["language"] = html.get("lang", "No language found.")
            yield Document(page_content=text, metadata=metadata)

    async def aload(self) -> list[Document]:
        """Load data into Document objects."""
        return [document async for document in self.alazy_load()]

def get_web_loader(
    urls: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
    trust_env: bool = False,
):
    # Check if the URLs are valid
    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)

    web_loader_args = {
        "web_paths": safe_urls,
        "verify_ssl": verify_ssl,
        "requests_per_second": requests_per_second,
        "continue_on_failure": True,
        "trust_env": trust_env,
    }

    # Default to None so an unrecognized engine falls through to the error below
    WebLoaderClass = None
    if WEB_LOADER_ENGINE.value == "" or WEB_LOADER_ENGINE.value == "safe_web":
        WebLoaderClass = SafeWebBaseLoader
    if WEB_LOADER_ENGINE.value == "playwright":
        WebLoaderClass = SafePlaywrightURLLoader
        web_loader_args["playwright_timeout"] = PLAYWRIGHT_TIMEOUT.value
        if PLAYWRIGHT_WS_URL.value:
            web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URL.value

    if WEB_LOADER_ENGINE.value == "firecrawl":
        WebLoaderClass = SafeFireCrawlLoader
        web_loader_args["api_key"] = FIRECRAWL_API_KEY.value
        web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value

    if WEB_LOADER_ENGINE.value == "tavily":
        WebLoaderClass = SafeTavilyLoader
        web_loader_args["api_key"] = TAVILY_API_KEY.value
        web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value

    if WEB_LOADER_ENGINE.value == "external":
        WebLoaderClass = ExternalWebLoader
        web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value
        web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value

    if WebLoaderClass:
        web_loader = WebLoaderClass(**web_loader_args)

        log.debug(
            "Using WEB_LOADER_ENGINE %s for %s URLs",
            web_loader.__class__.__name__,
            len(safe_urls),
        )

        return web_loader
    else:
        raise ValueError(
            f"Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. "
            "Please set it to 'safe_web', 'playwright', 'firecrawl', 'tavily', or 'external'."
        )
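For orientation, a hedged usage sketch of this module's entry point, assuming the file lives at open_webui/retrieval/web/utils.py; the URLs are placeholders, and WEB_LOADER_ENGINE and the other settings come from Open WebUI's config as shown above:
```python
import asyncio

from open_webui.retrieval.web.utils import get_web_loader

async def main():
    # safe_validate_urls() inside get_web_loader drops invalid or private-IP URLs first.
    loader = get_web_loader(
        ["https://example.com", "https://example.org/docs"],
        verify_ssl=True,
        requests_per_second=2,
    )
    # All loader classes above expose alazy_load() as an async generator of Documents.
    async for doc in loader.alazy_load():
        print(doc.metadata.get("source"), len(doc.page_content))

asyncio.run(main())
```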