This commit is contained in:
Timothy Jaeryang Baek 2025-12-21 18:08:36 +04:00
parent 7746e9f4b8
commit c96549eaa7
7 changed files with 170 additions and 71 deletions

View file

@ -14,6 +14,7 @@ from typing import Iterator, List, Optional, Sequence, Union
from fastapi import ( from fastapi import (
Depends, Depends,
FastAPI, FastAPI,
Query,
File, File,
Form, Form,
HTTPException, HTTPException,
@ -155,7 +156,9 @@ def get_rf(
): ):
rf = None rf = None
# Convert timeout string to int or None (system default) # Convert timeout string to int or None (system default)
timeout_value = int(external_reranker_timeout) if external_reranker_timeout else None timeout_value = (
int(external_reranker_timeout) if external_reranker_timeout else None
)
if reranking_model: if reranking_model:
if any(model in reranking_model for model in ["jinaai/jina-colbert-v2"]): if any(model in reranking_model for model in ["jinaai/jina-colbert-v2"]):
try: try:
@ -1750,44 +1753,53 @@ async def process_text(
@router.post("/process/youtube") @router.post("/process/youtube")
@router.post("/process/web") @router.post("/process/web")
async def process_web( async def process_web(
request: Request, form_data: ProcessUrlForm, user=Depends(get_verified_user) request: Request,
form_data: ProcessUrlForm,
process: bool = Query(True, description="Whether to process and save the content"),
user=Depends(get_verified_user),
): ):
try: try:
collection_name = form_data.collection_name
if not collection_name:
collection_name = calculate_sha256_string(form_data.url)[:63]
content, docs = await run_in_threadpool( content, docs = await run_in_threadpool(
get_content_from_url, request, form_data.url get_content_from_url, request, form_data.url
) )
log.debug(f"text_content: {content}") log.debug(f"text_content: {content}")
if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: if process:
await run_in_threadpool( collection_name = form_data.collection_name
save_docs_to_vector_db, if not collection_name:
request, collection_name = calculate_sha256_string(form_data.url)[:63]
docs,
collection_name,
overwrite=True,
user=user,
)
else:
collection_name = None
return { if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
"status": True, await run_in_threadpool(
"collection_name": collection_name, save_docs_to_vector_db,
"filename": form_data.url, request,
"file": { docs,
"data": { collection_name,
"content": content, overwrite=True,
user=user,
)
else:
collection_name = None
return {
"status": True,
"collection_name": collection_name,
"filename": form_data.url,
"file": {
"data": {
"content": content,
},
"meta": {
"name": form_data.url,
"source": form_data.url,
},
}, },
"meta": { }
"name": form_data.url, else:
"source": form_data.url, return {
}, "status": True,
}, "content": content,
} }
except Exception as e: except Exception as e:
log.exception(e) log.exception(e)
raise HTTPException( raise HTTPException(

View file

@ -327,10 +327,21 @@ export const processYoutubeVideo = async (token: string, url: string) => {
return res; return res;
}; };
export const processWeb = async (token: string, collection_name: string, url: string) => { export const processWeb = async (
token: string,
collection_name: string,
url: string,
process: boolean = true
) => {
let error = null; let error = null;
const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/web`, { const searchParams = new URLSearchParams();
if (!process) {
searchParams.append('process', 'false');
}
const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/web?${searchParams.toString()}`, {
method: 'POST', method: 'POST',
headers: { headers: {
Accept: 'application/json', Accept: 'application/json',

View file

@ -779,9 +779,6 @@
urls = [urls]; urls = [urls];
} }
// deduplicate URLs
urls = [...new Set(urls)];
// Create file items first // Create file items first
const fileItems = urls.map((url) => ({ const fileItems = urls.map((url) => ({
type: 'text', type: 'text',
@ -796,7 +793,6 @@
// Display all items at once // Display all items at once
files = [...files, ...fileItems]; files = [...files, ...fileItems];
// Process sequentially (NOT parallel)
for (const fileItem of fileItems) { for (const fileItem of fileItems) {
try { try {
const res = isYoutubeUrl(fileItem.url) const res = isYoutubeUrl(fileItem.url)
@ -811,14 +807,12 @@
...fileItem.file ...fileItem.file
}; };
} }
files = [...files];
} catch (e) { } catch (e) {
fileItem.status = 'error'; files = files.filter((f) => f.name !== url);
fileItem.error = String(e);
toast.error(`${e}`); toast.error(`${e}`);
} }
// Force UI reactivity after each file finishes
files = [...files];
} }
}; };

View file

@ -93,6 +93,7 @@
const i18n = getContext('i18n'); const i18n = getContext('i18n');
export let onUpload: Function = (e) => {};
export let onChange: Function = () => {}; export let onChange: Function = () => {};
export let createMessagePair: Function; export let createMessagePair: Function;

View file

@ -15,12 +15,14 @@
let url = ''; let url = '';
const submitHandler = () => { const submitHandler = () => {
const urls = url let urls = url
.split('\n') .split('\n')
.map((u) => u.trim()) .map((u) => u.trim())
.filter((u) => u !== '') .filter((u) => u !== '')
.filter((u) => isValidHttpUrl(u)); .filter((u) => isValidHttpUrl(u));
urls = [...new Set(urls)];
if (urls.length === 0) { if (urls.length === 0) {
toast.error($i18n.t('Please enter a valid URL.')); toast.error($i18n.t('Please enter a valid URL.'));
return; return;

View file

@ -33,7 +33,9 @@
updateKnowledgeById, updateKnowledgeById,
searchKnowledgeFilesById searchKnowledgeFilesById
} from '$lib/apis/knowledge'; } from '$lib/apis/knowledge';
import { blobToFile } from '$lib/utils'; import { processWeb, processYoutubeVideo } from '$lib/apis/retrieval';
import { blobToFile, isYoutubeUrl } from '$lib/utils';
import Spinner from '$lib/components/common/Spinner.svelte'; import Spinner from '$lib/components/common/Spinner.svelte';
import Files from './KnowledgeBase/Files.svelte'; import Files from './KnowledgeBase/Files.svelte';
@ -169,10 +171,85 @@
return file; return file;
}; };
/**
 * Fetch the text content of one or more web URLs and add each one to the
 * knowledge base as an uploaded file.
 *
 * A placeholder row (status `uploading`) is shown for every URL right away.
 * URLs are then handled one at a time: the content is fetched with
 * `processWeb(..., false)`, wrapped into a text file, uploaded with
 * `uploadFile`, and finally attached through `addFileHandler`. Whenever a step
 * fails, the placeholder row for that URL is removed and a toast is shown, so
 * no row is left stuck in the `uploading` state.
 *
 * @param urls - A single URL or an array of URLs. Entries that are not
 *   non-blank strings are ignored.
 */
const uploadWeb = async (urls) => {
	const urlList = (Array.isArray(urls) ? urls : [urls]).filter(
		(url) => typeof url === 'string' && url.trim() !== ''
	);

	const newFileItems = urlList.map((url) => ({
		type: 'file',
		file: '',
		id: null,
		url: url,
		name: url,
		size: null,
		status: 'uploading',
		error: '',
		itemId: uuidv4()
	}));

	// Placeholders are matched by `itemId` because `id` stays null until the upload succeeds.
	const removePlaceholder = (itemId) => {
		fileItems = (fileItems ?? []).filter((item) => item.itemId !== itemId);
	};

	// Display all items at once
	fileItems = [...newFileItems, ...(fileItems ?? [])];

	for (const fileItem of newFileItems) {
		try {
			// The last argument (`process = false`) makes the client append `?process=false`.
			const res = await processWeb(localStorage.token, '', fileItem.url, false).catch((e) => {
				console.error('Error processing web URL:', e);
				return null;
			});

			// `res.content` is the only field read from the response; without it there is nothing to upload.
			const content = res?.content;
			if (typeof content !== 'string' || content === '') {
				removePlaceholder(fileItem.itemId);
				toast.error($i18n.t('Failed to process URL: {{url}}', { url: fileItem.url }));
				continue;
			}

			// Use the URL as the file name: non-alphanumerics become `_`, lower-cased, capped at 50 chars.
			const fileName = fileItem.url
				.replace(/[^a-z0-9]/gi, '_')
				.toLowerCase()
				.slice(0, 50);
			const file = createFileFromText(fileName, content);

			const uploadedFile = await uploadFile(localStorage.token, file).catch((e) => {
				toast.error(`${e}`);
				return null;
			});

			if (!uploadedFile) {
				// Previously the placeholder was left behind here, spinning forever.
				removePlaceholder(fileItem.itemId);
				toast.error($i18n.t('Failed to upload file.'));
				continue;
			}

			// Attach the server-side id to the placeholder row.
			fileItems = fileItems.map((item) => {
				if (item.itemId === fileItem.itemId) {
					item.id = uploadedFile.id;
				}
				return item;
			});

			if (uploadedFile.error) {
				console.warn('File upload warning:', uploadedFile.error);
				toast.warning(uploadedFile.error);
				removePlaceholder(fileItem.itemId);
			} else {
				await addFileHandler(uploadedFile.id);
			}
		} catch (e) {
			// Any unexpected failure: drop the placeholder and surface the error.
			removePlaceholder(fileItem.itemId);
			toast.error(`${e}`);
		}
	}
};
const uploadFileHandler = async (file) => { const uploadFileHandler = async (file) => {
console.log(file); console.log(file);
const tempItemId = uuidv4();
const fileItem = { const fileItem = {
type: 'file', type: 'file',
file: '', file: '',
@ -182,7 +259,7 @@
size: file.size, size: file.size,
status: 'uploading', status: 'uploading',
error: '', error: '',
itemId: tempItemId itemId: uuidv4()
}; };
if (fileItem.size == 0) { if (fileItem.size == 0) {
@ -206,7 +283,7 @@
return; return;
} }
fileItems = [...(fileItems ?? []), fileItem]; fileItems = [fileItem, ...(fileItems ?? [])];
try { try {
let metadata = { let metadata = {
knowledge_id: knowledge.id, knowledge_id: knowledge.id,
@ -227,12 +304,9 @@
if (uploadedFile) { if (uploadedFile) {
console.log(uploadedFile); console.log(uploadedFile);
fileItems = fileItems.map((item) => { fileItems = fileItems.map((item) => {
if (item.itemId === tempItemId) { if (item.itemId === fileItem.itemId) {
item.id = uploadedFile.id; item.id = uploadedFile.id;
} }
// Remove temporary item id
delete item.itemId;
return item; return item;
}); });
@ -701,8 +775,8 @@
<AttachWebpageModal <AttachWebpageModal
bind:show={showAddWebpageModal} bind:show={showAddWebpageModal}
onSubmit={async (data) => { onSubmit={async (e) => {
console.log(data); uploadWeb(e.data);
}} }}
/> />

View file

@ -25,7 +25,7 @@
</script> </script>
<div class=" max-h-full flex flex-col w-full gap-[0.5px]"> <div class=" max-h-full flex flex-col w-full gap-[0.5px]">
{#each files as file (file?.id ?? file?.tempId)} {#each files as file (file?.id ?? file?.itemId ?? file?.tempId)}
<div <div
class=" flex cursor-pointer w-full px-1.5 py-0.5 bg-transparent dark:hover:bg-gray-850/50 hover:bg-white rounded-xl transition {selectedFileId class=" flex cursor-pointer w-full px-1.5 py-0.5 bg-transparent dark:hover:bg-gray-850/50 hover:bg-white rounded-xl transition {selectedFileId
? '' ? ''
@ -59,24 +59,29 @@
</div> </div>
<div class="flex items-center gap-2 shrink-0"> <div class="flex items-center gap-2 shrink-0">
<Tooltip content={dayjs(file.updated_at * 1000).format('LLLL')}> {#if file?.updated_at}
<div> <Tooltip content={dayjs(file.updated_at * 1000).format('LLLL')}>
{dayjs(file.updated_at * 1000).fromNow()} <div>
</div> {dayjs(file.updated_at * 1000).fromNow()}
</Tooltip> </div>
<Tooltip </Tooltip>
content={file?.user?.email ?? $i18n.t('Deleted User')} {/if}
className="flex shrink-0"
placement="top-start" {#if file?.user}
> <Tooltip
<div class="shrink-0 text-gray-500"> content={file?.user?.email ?? $i18n.t('Deleted User')}
{$i18n.t('By {{name}}', { className="flex shrink-0"
name: capitalizeFirstLetter( placement="top-start"
file?.user?.name ?? file?.user?.email ?? $i18n.t('Deleted User') >
) <div class="shrink-0 text-gray-500">
})} {$i18n.t('By {{name}}', {
</div> name: capitalizeFirstLetter(
</Tooltip> file?.user?.name ?? file?.user?.email ?? $i18n.t('Deleted User')
)
})}
</div>
</Tooltip>
{/if}
</div> </div>
</button> </button>