feat: dynamically load pdfjs

This commit is contained in:
Shirasawa 2025-09-05 17:55:04 +08:00
parent c0a47169fa
commit 49e045ea3d
2 changed files with 20 additions and 14 deletions

View file

@ -1,8 +1,4 @@
<script lang="ts"> <script lang="ts">
import * as pdfjs from 'pdfjs-dist';
import * as pdfWorker from 'pdfjs-dist/build/pdf.worker.mjs';
pdfjs.GlobalWorkerOptions.workerSrc = import.meta.url + 'pdfjs-dist/build/pdf.worker.mjs';
import DOMPurify from 'dompurify'; import DOMPurify from 'dompurify';
import { marked } from 'marked'; import { marked } from 'marked';
import heic2any from 'heic2any'; import heic2any from 'heic2any';
@ -640,7 +636,7 @@
} else { } else {
// If temporary chat is enabled, we just add the file to the list without uploading it. // If temporary chat is enabled, we just add the file to the list without uploading it.
const content = await extractContentFromFile(file, pdfjsLib).catch((error) => { const content = await extractContentFromFile(file).catch((error) => {
toast.error( toast.error(
$i18n.t('Failed to extract content from the file: {{error}}', { error: error }) $i18n.t('Failed to extract content from the file: {{error}}', { error: error })
); );

View file

@ -15,6 +15,8 @@ dayjs.extend(localizedFormat);
import { TTS_RESPONSE_SPLIT } from '$lib/types'; import { TTS_RESPONSE_SPLIT } from '$lib/types';
import pdfWorkerUrl from 'pdfjs-dist/build/pdf.worker.mjs?url';
import { marked } from 'marked'; import { marked } from 'marked';
import markedExtension from '$lib/utils/marked/extension'; import markedExtension from '$lib/utils/marked/extension';
import markedKatexExtension from '$lib/utils/marked/katex-extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension';
@ -1440,7 +1442,18 @@ export const parseJsonValue = (value: string): any => {
return value; return value;
}; };
/**
 * Lazily loads pdfjs-dist the first time PDF support is needed and returns
 * the library object to use for PDF extraction.
 *
 * If `window.pdfjsLib` is already set it is returned as-is. Otherwise the
 * `pdfjs-dist` module is imported dynamically and its worker source is
 * pointed at `pdfWorkerUrl`.
 *
 * @returns The pdf.js library object (exposes `getDocument`).
 * @throws Error if no usable pdf.js library is available after loading.
 */
async function ensurePDFjsLoaded() {
	if (window.pdfjsLib) {
		return window.pdfjsLib;
	}

	const pdfjs = await import('pdfjs-dist');
	pdfjs.GlobalWorkerOptions.workerSrc = pdfWorkerUrl;

	// Do not rely solely on the import populating `window.pdfjsLib` as a side
	// effect: prefer the global when present, otherwise fall back to the
	// module namespace that was just imported.
	const pdfjsLib = window.pdfjsLib ?? pdfjs;
	if (typeof pdfjsLib?.getDocument !== 'function') {
		throw new Error('pdfjsLib is required for PDF extraction');
	}
	return pdfjsLib;
}
export const extractContentFromFile = async (file: File) => {
// Known text file extensions for extra fallback // Known text file extensions for extra fallback
const textExtensions = [ const textExtensions = [
'.txt', '.txt',
@ -1457,31 +1470,28 @@ export const extractContentFromFile = async (file, pdfjsLib = null) => {
'.rtf' '.rtf'
]; ];
/**
 * Returns the lower-cased extension of `filename`, including the leading dot.
 *
 * @param filename File name to inspect.
 * @returns Everything from the last `.` onward, lower-cased, or `''` when the
 *          name contains no dot.
 */
function getExtension(filename: string): string {
	const dot = filename.lastIndexOf('.');
	// `slice` replaces the deprecated `substr`; identical for a non-negative start.
	return dot === -1 ? '' : filename.slice(dot).toLowerCase();
}
// Uses pdfjs to extract text from PDF // Uses pdfjs to extract text from PDF
/**
 * Extracts the text of every page of a PDF file.
 *
 * Loads pdf.js via `ensurePDFjsLoaded()`, opens the file from its bytes and
 * concatenates each page's text items (space-separated), ending each page
 * with a newline.
 *
 * @param file PDF file to read.
 * @returns The concatenated text of all pages.
 * @throws Whatever `ensurePDFjsLoaded()` or pdf.js rejects with.
 */
async function extractPdfText(file: File): Promise<string> {
	const pdfjsLib = await ensurePDFjsLoaded();
	const arrayBuffer = await file.arrayBuffer();
	const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;
	try {
		let allText = '';
		for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
			const page = await pdf.getPage(pageNum);
			const content = await page.getTextContent();
			// Items without a `str` field contribute an empty string, which is
			// what `join` would produce for `undefined` anyway.
			const strings = content.items.map((item: { str?: string }) => item.str ?? '');
			allText += strings.join(' ') + '\n';
		}
		return allText;
	} finally {
		// Release the document (and its worker resources) whether or not
		// extraction succeeded.
		await pdf.destroy();
	}
}
// Reads file as text using FileReader // Reads file as text using FileReader
function readAsText(file) { function readAsText(file: File) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const reader = new FileReader(); const reader = new FileReader();
reader.onload = () => resolve(reader.result); reader.onload = () => resolve(reader.result);