sourcebot/packages/web/src/features/search/searchApi.ts

import { env } from "@/env.mjs";
import { invalidZoektResponse, ServiceError } from "../../lib/serviceError";
import { isServiceError } from "../../lib/utils";
import { zoektFetch } from "./zoektClient";
import { prisma } from "@/prisma";
import { ErrorCode } from "../../lib/errorCodes";
import { StatusCodes } from "http-status-codes";
import { zoektSearchResponseSchema } from "./zoektSchema";
import { SearchRequest, SearchResponse, SearchResultRange } from "./types";
import assert from "assert";

// List of supported query prefixes in zoekt.
// Every value includes the trailing `:` so it can be concatenated directly
// with an operand (e.g. `${zoektPrefixes.branch}main`) or used for prefix
// matching against a query atom.
// Members named `*Short` hold the abbreviated spelling of the prefix they are
// named after (e.g. `b:` for `branch:`).
// @see : https://github.com/sourcebot-dev/zoekt/blob/main/query/parse.go#L417
enum zoektPrefixes {
    archived = "archived:",
    branchShort = "b:",
    branch = "branch:",
    caseShort = "c:",
    case = "case:",
    content = "content:",
    fileShort = "f:",
    file = "file:",
    fork = "fork:",
    public = "public:",
    repoShort = "r:",
    repo = "repo:",
    regex = "regex:",
    lang = "lang:",
    sym = "sym:",
    typeShort = "t:",
    type = "type:",
    // Emitted by `transformZoektQuery` when expanding a `context:` atom.
    reposet = "reposet:",
}

/**
 * Rewrites the Sourcebot-specific atoms of a search query into atoms that
 * zoekt understands. The query is processed one space-separated token at a
 * time:
 *
 * - `rev:<name>` / `revision:<name>` become `branch:<name>`; the special
 *   name `*` becomes an empty branch name, which zoekt treats as "all branches".
 * - `context:<name>` becomes `reposet:<repo1>,<repo2>,...` using the repos of
 *   the search context with that name in the given org.
 * - A leading `-` (negation) is carried over to the rewritten atom.
 * - Every other token is passed through unchanged.
 *
 * @param query - The raw query string.
 * @param orgId - The org whose search contexts are consulted.
 * @returns The rewritten query, or a `SEARCH_CONTEXT_NOT_FOUND` service error
 * if a referenced search context does not exist.
 */
const transformZoektQuery = async (query: string, orgId: number): Promise<string | ServiceError> => {
    const revisionPattern = /^-?(rev|revision):.+$/;
    const contextPattern = /^-?context:.+$/;
    const rewrittenTokens: string[] = [];

    for (const token of query.split(" ")) {
        const isRevisionAtom = revisionPattern.test(token);

        // Anything that is neither a revision nor a context atom is kept verbatim.
        if (!isRevisionAtom && !contextPattern.test(token)) {
            rewrittenTokens.push(token);
            continue;
        }

        const negation = token.startsWith("-") ? "-" : "";
        // Everything after the first `:` is the operand of the atom.
        const operand = token.substring(token.indexOf(":") + 1);

        if (isRevisionAtom) {
            // Special case: `*` -> search all revisions.
            // In zoekt, providing a blank string will match all branches.
            // @see: https://github.com/sourcebot-dev/zoekt/blob/main/eval.go#L560-L562
            const branchName = operand === "*" ? "" : operand;
            rewrittenTokens.push(`${negation}${zoektPrefixes.branch}${branchName}`);
            continue;
        }

        // Context atom: look up the search context and expand it into a `reposet:` atom.
        const searchContext = await prisma.searchContext.findUnique({
            where: {
                name_orgId: {
                    name: operand,
                    orgId,
                }
            },
            include: {
                repos: true,
            }
        });

        if (!searchContext) {
            return {
                errorCode: ErrorCode.SEARCH_CONTEXT_NOT_FOUND,
                message: `Search context "${operand}" not found`,
                statusCode: StatusCodes.NOT_FOUND,
            } satisfies ServiceError;
        }

        const repoNames = searchContext.repos.map(({ name }) => name).join(",");
        rewrittenTokens.push(`${negation}${zoektPrefixes.reposet}${repoNames}`);
    }

    return rewrittenTokens.join(" ");
}

/**
 * Builds the web URL of a file from a zoekt repository URL template.
 *
 * This is a hacky parser for templates generated by the go text/template
 * package. Example template:
 * {{URLJoinPath "https://github.com/sourcebot-dev/sourcebot" "blob" .Version .Path}}
 * The `{{URLJoinPath ...}}` action may be followed by a query string that
 * references `{{.Version}}` and/or `{{.Path}}`.
 *
 * @param template - The zoekt URL template for the repository.
 * @param branch - Value substituted for `.Version`.
 * @param fileName - Value substituted for `.Path`.
 * @returns The URI-encoded file URL.
 * @throws AssertionError if the template does not have the expected shape.
 */
function getRepositoryUrl(template: string, branch: string, fileName: string): string {
    // The template should always match this regex, so let's assert that.
    assert(template.match(/^{{URLJoinPath\s.*}}(\?.+)?$/), "Invalid template");

    const actionEnd = template.indexOf("}}");

    // `encodeURI` leaves `?` and `#` untouched because they are URL delimiters.
    // Inside a branch or file name they are plain characters and would cut the
    // path short, so they are percent-encoded explicitly.
    const encodePathValue = (value: string): string =>
        encodeURI(value).replace(/[?#]/g, (char) => (char === "?" ? "%3F" : "%23"));

    const url =
        template.substring("{{URLJoinPath ".length, actionEnd)
        .split(" ")
        .map((part) => {
            // Substitute the variables only AFTER splitting into arguments, and
            // only when the whole argument is the variable. Substituting first
            // would split a branch or file name that contains a space into
            // several path segments.
            if (part === ".Version") return encodePathValue(branch);
            if (part === ".Path") return encodePathValue(fileName);

            // Literal argument: remove wrapping quotes.
            if (part.startsWith("\"")) part = part.substring(1);
            if (part.endsWith("\"")) part = part.substring(0, part.length - 1);
            return encodeURI(part);
        })
        .join("/");

    // Replacer functions are used so that `$`-sequences (`$$`, `$&`, ...) inside
    // the branch or file name are inserted literally rather than being
    // interpreted as replacement patterns.
    const optionalQueryParams =
        template.substring(actionEnd + 2)
        .replace("{{.Version}}", () => branch)
        .replace("{{.Path}}", () => fileName);

    return url + encodeURI(optionalQueryParams);
}

/**
 * Runs a search against zoekt on behalf of an org and maps the response into
 * a `SearchResponse`.
 *
 * Steps:
 * 1. Rewrite Sourcebot-specific atoms (`rev:`, `context:`) via `transformZoektQuery`.
 * 2. Restrict the query to `branch:HEAD` unless it already filters by branch.
 * 3. POST the query to zoekt's `/api/search` endpoint with the org id as tenant header.
 * 4. Validate the response body with `zoektSearchResponseSchema` and reshape it.
 *
 * @param request - The search request (`query`, `matches`, `contextLines`, `whole`).
 * @param orgId - The org the search is performed for.
 * @returns The transformed search response, or a service error if the query
 * could not be transformed or zoekt answered with a non-OK status.
 * @throws If a returned file's repository has no URL template, or the template is
 * malformed (assertions). `parser.parse` presumably also throws when the body
 * does not match the schema - confirm against `zoektSchema`.
 */
export const search = async ({ query, matches, contextLines, whole }: SearchRequest, orgId: number) => {
    const transformedQuery = await transformZoektQuery(query, orgId);
    if (isServiceError(transformedQuery)) {
        return transformedQuery;
    }
    query = transformedQuery;

    // Detect a branch filter per query atom instead of by substring: a plain
    // `includes("b:")` also matches ordinary search text such as `db:` or
    // `job:`, which would silently drop the default-branch restriction below.
    // Atoms are delimited by whitespace and parentheses and may be negated
    // with a leading `-`.
    const branchPrefixes: string[] = [zoektPrefixes.branch, zoektPrefixes.branchShort];
    const isBranchFilteringEnabled = query
        .split(/[\s()]+/)
        .some((atom) => {
            const unsigned = atom.startsWith("-") ? atom.substring(1) : atom;
            return branchPrefixes.some((prefix) => unsigned.startsWith(prefix));
        });

    // We only want to show matches for the default branch when
    // the user isn't explicitly filtering by branch.
    if (!isBranchFilteringEnabled) {
        query = query.concat(` branch:HEAD`);
    }

    const body = JSON.stringify({
        q: query,
        // @see: https://github.com/sourcebot-dev/zoekt/blob/main/api.go#L892
        opts: {
            ChunkMatches: true,
            MaxMatchDisplayCount: matches,
            NumContextLines: contextLines,
            Whole: !!whole,
            TotalMaxMatchCount: env.TOTAL_MAX_MATCH_COUNT,
            ShardMaxMatchCount: env.SHARD_MAX_MATCH_COUNT,
            MaxWallTime: env.ZOEKT_MAX_WALL_TIME_MS * 1000 * 1000, // zoekt expects a duration in nanoseconds
        }
    });

    const header: Record<string, string> = {
        "X-Tenant-ID": orgId.toString()
    };

    const searchResponse = await zoektFetch({
        path: "/api/search",
        body,
        header,
        method: "POST",
    });

    if (!searchResponse.ok) {
        return invalidZoektResponse(searchResponse);
    }

    const searchBody = await searchResponse.json();

    // Validate the raw zoekt payload and convert it to the camelCase
    // `SearchResponse` shape in a single step.
    const parser = zoektSearchResponseSchema.transform(({ Result }) => ({
        zoektStats: {
            duration: Result.Duration,
            fileCount: Result.FileCount,
            matchCount: Result.MatchCount,
            filesSkipped: Result.FilesSkipped,
            contentBytesLoaded: Result.ContentBytesLoaded,
            indexBytesLoaded: Result.IndexBytesLoaded,
            crashes: Result.Crashes,
            shardFilesConsidered: Result.ShardFilesConsidered,
            filesConsidered: Result.FilesConsidered,
            filesLoaded: Result.FilesLoaded,
            shardsScanned: Result.ShardsScanned,
            shardsSkipped: Result.ShardsSkipped,
            shardsSkippedFilter: Result.ShardsSkippedFilter,
            ngramMatches: Result.NgramMatches,
            ngramLookups: Result.NgramLookups,
            wait: Result.Wait,
            matchTreeConstruction: Result.MatchTreeConstruction,
            matchTreeSearch: Result.MatchTreeSearch,
            regexpsConsidered: Result.RegexpsConsidered,
            flushReason: Result.FlushReason,
        },
        files: Result.Files?.map((file) => {
            // Chunks flagged with `FileName` describe matches in the file's
            // name rather than in its content.
            const fileNameChunks = file.ChunkMatches.filter((chunk) => chunk.FileName);

            const template = Result.RepoURLs[file.Repository];
            assert(template, `Template not found for repository ${file.Repository}`);

            // If there are multiple branches pointing to the same revision of this file, it doesn't
            // matter which branch we use here, so use the first one.
            const branch = file.Branches && file.Branches.length > 0 ? file.Branches[0] : "HEAD";
            const url = getRepositoryUrl(template, branch, file.FileName);

            return {
                fileName: {
                    text: file.FileName,
                    // File name match ranges are only reported when there is exactly one file name chunk.
                    matchRanges: fileNameChunks.length === 1 ? fileNameChunks[0].Ranges.map((range) => ({
                        start: {
                            byteOffset: range.Start.ByteOffset,
                            column: range.Start.Column,
                            lineNumber: range.Start.LineNumber,
                        },
                        end: {
                            byteOffset: range.End.ByteOffset,
                            column: range.End.Column,
                            lineNumber: range.End.LineNumber,
                        }
                    })) : [],
                },
                repository: file.Repository,
                url: url,
                language: file.Language,
                chunks: file.ChunkMatches
                    .filter((chunk) => !chunk.FileName) // Filter out filename chunks.
                    .map((chunk) => {
                        return {
                            content: chunk.Content,
                            matchRanges: chunk.Ranges.map((range) => ({
                                start: {
                                    byteOffset: range.Start.ByteOffset,
                                    column: range.Start.Column,
                                    lineNumber: range.Start.LineNumber,
                                },
                                end: {
                                    byteOffset: range.End.ByteOffset,
                                    column: range.End.Column,
                                    lineNumber: range.End.LineNumber,
                                }
                            }) satisfies SearchResultRange),
                            contentStart: {
                                byteOffset: chunk.ContentStart.ByteOffset,
                                column: chunk.ContentStart.Column,
                                lineNumber: chunk.ContentStart.LineNumber,
                            },
                            symbols: chunk.SymbolInfo?.map((symbol) => {
                                return {
                                    symbol: symbol.Sym,
                                    kind: symbol.Kind,
                                    // An empty parent name means the symbol has no parent.
                                    parent: symbol.Parent.length > 0 ? {
                                        symbol: symbol.Parent,
                                        kind: symbol.ParentKind,
                                    } : undefined,
                                }
                            }) ?? undefined,
                        }
                    }),
                branches: file.Branches,
                content: file.Content,
            }
        }) ?? [],
        isBranchFilteringEnabled: isBranchFilteringEnabled,
    } satisfies SearchResponse));

    return parser.parse(searchBody);
}
chore: Sourcebot REST api surface (#290) 2025-05-03 18:33:58 +00:00			`import { env } from "@/env.mjs";`
			`import { invalidZoektResponse, ServiceError } from "../../lib/serviceError";`
			`import { isServiceError } from "../../lib/utils";`
			`import { zoektFetch } from "./zoektClient";`
			`import { prisma } from "@/prisma";`
			`import { ErrorCode } from "../../lib/errorCodes";`
			`import { StatusCodes } from "http-status-codes";`
			`import { zoektSearchResponseSchema } from "./zoektSchema";`
			`import { SearchRequest, SearchResponse, SearchResultRange } from "./types";`
Sourcebot MCP (#292) 2025-05-07 23:21:05 +00:00			`import assert from "assert";`
chore: Sourcebot REST api surface (#290) 2025-05-03 18:33:58 +00:00
			`// List of supported query prefixes in zoekt.`
			`// @see : https://github.com/sourcebot-dev/zoekt/blob/main/query/parse.go#L417`
			`enum zoektPrefixes {`
			`archived = "archived:",`
			`branchShort = "b:",`
			`branch = "branch:",`
			`caseShort = "c:",`
			`case = "case:",`
			`content = "content:",`
			`fileShort = "f:",`
			`file = "file:",`
			`fork = "fork:",`
			`public = "public:",`
			`repoShort = "r:",`
			`repo = "repo:",`
			`regex = "regex:",`
			`lang = "lang:",`
			`sym = "sym:",`
			`typeShort = "t:",`
			`type = "type:",`
			`reposet = "reposet:",`
			`}`

			`const transformZoektQuery = async (query: string, orgId: number): Promise<string \| ServiceError> => {`
			`const prevQueryParts = query.split(" ");`
			`const newQueryParts = [];`

			`for (const part of prevQueryParts) {`

			// Handle mapping `rev:` and `revision:` to `branch:`
			`if (part.match(/^-?(rev\|revision):.+$/)) {`
			`const isNegated = part.startsWith("-");`
			`let revisionName = part.slice(part.indexOf(":") + 1);`

			// Special case: `*` -> search all revisions.
			`// In zoekt, providing a blank string will match all branches.`
			`// @see: https://github.com/sourcebot-dev/zoekt/blob/main/eval.go#L560-L562`
			`if (revisionName === "*") {`
			`revisionName = "";`
			`}`
			newQueryParts.push(`${isNegated ? "-" : ""}${zoektPrefixes.branch}${revisionName}`);
			`}`

			// Expand `context:` into `reposet:` atom.
			`else if (part.match(/^-?context:.+$/)) {`
			`const isNegated = part.startsWith("-");`
			`const contextName = part.slice(part.indexOf(":") + 1);`

			`const context = await prisma.searchContext.findUnique({`
			`where: {`
			`name_orgId: {`
			`name: contextName,`
			`orgId,`
			`}`
			`},`
			`include: {`
			`repos: true,`
			`}`
			`});`

			`// If the context doesn't exist, return an error.`
			`if (!context) {`
			`return {`
			`errorCode: ErrorCode.SEARCH_CONTEXT_NOT_FOUND,`
			message: `Search context "${contextName}" not found`,
			`statusCode: StatusCodes.NOT_FOUND,`
			`} satisfies ServiceError;`
			`}`

			`const names = context.repos.map((repo) => repo.name);`
			newQueryParts.push(`${isNegated ? "-" : ""}${zoektPrefixes.reposet}${names.join(",")}`);
			`}`

			`// no-op: add the original part to the new query parts.`
			`else {`
			`newQueryParts.push(part);`
			`}`
			`}`

			`return newQueryParts.join(" ");`
			`}`

Sourcebot MCP (#292) 2025-05-07 23:21:05 +00:00			`// Extracts a repository file URL from a zoekt template, branch, and file name.`
			`function getRepositoryUrl(template: string, branch: string, fileName: string): string {`
			`// This is a hacky parser for templates generated by`
			`// the go text/template package. Example template:`
			`// {{URLJoinPath "https://github.com/sourcebot-dev/sourcebot" "blob" .Version .Path}}`

			`// The template should always match this regex, so let's assert that.`
			`assert(template.match(/^{{URLJoinPath\s.*}}(\?.+)?$/), "Invalid template");`

			`const url =`
			`template.substring("{{URLJoinPath ".length, template.indexOf("}}"))`
			`.replace(".Version", branch)`
			`.replace(".Path", fileName)`
			`.split(" ")`
			`.map((part) => {`
			`// remove wrapping quotes`
			`if (part.startsWith("\"")) part = part.substring(1);`
			`if (part.endsWith("\"")) part = part.substring(0, part.length - 1);`
			`return part;`
			`})`
			`.join("/");`

			`const optionalQueryParams =`
			`template.substring(template.indexOf("}}") + 2)`
			`.replace("{{.Version}}", branch)`
			`.replace("{{.Path}}", fileName);`

			`return encodeURI(url + optionalQueryParams);`
			`}`

chore: Sourcebot REST api surface (#290) 2025-05-03 18:33:58 +00:00			`export const search = async ({ query, matches, contextLines, whole }: SearchRequest, orgId: number) => {`
			`const transformedQuery = await transformZoektQuery(query, orgId);`
			`if (isServiceError(transformedQuery)) {`
			`return transformedQuery;`
			`}`
			`query = transformedQuery;`

			`const isBranchFilteringEnabled = (`
			`query.includes(zoektPrefixes.branch) \|\|`
			`query.includes(zoektPrefixes.branchShort)`
			`);`

			`// We only want to show matches for the default branch when`
			`// the user isn't explicitly filtering by branch.`
			`if (!isBranchFilteringEnabled) {`
			query = query.concat(` branch:HEAD`);
			`}`

			`const body = JSON.stringify({`
			`q: query,`
			`// @see: https://github.com/sourcebot-dev/zoekt/blob/main/api.go#L892`
			`opts: {`
			`ChunkMatches: true,`
			`MaxMatchDisplayCount: matches,`
			`NumContextLines: contextLines,`
			`Whole: !!whole,`
			`TotalMaxMatchCount: env.TOTAL_MAX_MATCH_COUNT,`
			`ShardMaxMatchCount: env.SHARD_MAX_MATCH_COUNT,`
			`MaxWallTime: env.ZOEKT_MAX_WALL_TIME_MS * 1000 * 1000, // zoekt expects a duration in nanoseconds`
			`}`
			`});`

			`let header: Record<string, string> = {};`
			`header = {`
			`"X-Tenant-ID": orgId.toString()`
			`};`

			`const searchResponse = await zoektFetch({`
			`path: "/api/search",`
			`body,`
			`header,`
			`method: "POST",`
			`});`

			`if (!searchResponse.ok) {`
			`return invalidZoektResponse(searchResponse);`
			`}`

			`const searchBody = await searchResponse.json();`

			`const parser = zoektSearchResponseSchema.transform(({ Result }) => ({`
			`zoektStats: {`
			`duration: Result.Duration,`
			`fileCount: Result.FileCount,`
			`matchCount: Result.MatchCount,`
			`filesSkipped: Result.FilesSkipped,`
			`contentBytesLoaded: Result.ContentBytesLoaded,`
			`indexBytesLoaded: Result.IndexBytesLoaded,`
			`crashes: Result.Crashes,`
			`shardFilesConsidered: Result.ShardFilesConsidered,`
			`filesConsidered: Result.FilesConsidered,`
			`filesLoaded: Result.FilesLoaded,`
			`shardsScanned: Result.ShardsScanned,`
			`shardsSkipped: Result.ShardsSkipped,`
			`shardsSkippedFilter: Result.ShardsSkippedFilter,`
			`ngramMatches: Result.NgramMatches,`
			`ngramLookups: Result.NgramLookups,`
			`wait: Result.Wait,`
			`matchTreeConstruction: Result.MatchTreeConstruction,`
			`matchTreeSearch: Result.MatchTreeSearch,`
			`regexpsConsidered: Result.RegexpsConsidered,`
			`flushReason: Result.FlushReason,`
			`},`
			`files: Result.Files?.map((file) => {`
			`const fileNameChunks = file.ChunkMatches.filter((chunk) => chunk.FileName);`
Sourcebot MCP (#292) 2025-05-07 23:21:05 +00:00
			`const template = Result.RepoURLs[file.Repository];`
			assert(template, `Template not found for repository ${file.Repository}`);

			`// If there are multiple branches pointing to the same revision of this file, it doesn't`
			`// matter which branch we use here, so use the first one.`
			`const branch = file.Branches && file.Branches.length > 0 ? file.Branches[0] : "HEAD";`
			`const url = getRepositoryUrl(template, branch, file.FileName);`

chore: Sourcebot REST api surface (#290) 2025-05-03 18:33:58 +00:00			`return {`
			`fileName: {`
			`text: file.FileName,`
			`matchRanges: fileNameChunks.length === 1 ? fileNameChunks[0].Ranges.map((range) => ({`
			`start: {`
			`byteOffset: range.Start.ByteOffset,`
			`column: range.Start.Column,`
			`lineNumber: range.Start.LineNumber,`
			`},`
			`end: {`
			`byteOffset: range.End.ByteOffset,`
			`column: range.End.Column,`
			`lineNumber: range.End.LineNumber,`
			`}`
			`})) : [],`
			`},`
			`repository: file.Repository,`
Sourcebot MCP (#292) 2025-05-07 23:21:05 +00:00			`url: url,`
chore: Sourcebot REST api surface (#290) 2025-05-03 18:33:58 +00:00			`language: file.Language,`
			`chunks: file.ChunkMatches`
			`.filter((chunk) => !chunk.FileName) // Filter out filename chunks.`
			`.map((chunk) => {`
			`return {`
			`content: chunk.Content,`
			`matchRanges: chunk.Ranges.map((range) => ({`
			`start: {`
			`byteOffset: range.Start.ByteOffset,`
			`column: range.Start.Column,`
			`lineNumber: range.Start.LineNumber,`
			`},`
			`end: {`
			`byteOffset: range.End.ByteOffset,`
			`column: range.End.Column,`
			`lineNumber: range.End.LineNumber,`
			`}`
			`}) satisfies SearchResultRange),`
			`contentStart: {`
			`byteOffset: chunk.ContentStart.ByteOffset,`
			`column: chunk.ContentStart.Column,`
			`lineNumber: chunk.ContentStart.LineNumber,`
			`},`
			`symbols: chunk.SymbolInfo?.map((symbol) => {`
			`return {`
			`symbol: symbol.Sym,`
			`kind: symbol.Kind,`
			`parent: symbol.Parent.length > 0 ? {`
			`symbol: symbol.Parent,`
			`kind: symbol.ParentKind,`
			`} : undefined,`
			`}`
			`}) ?? undefined,`
			`}`
			`}),`
			`branches: file.Branches,`
			`content: file.Content,`
			`}`
Sourcebot MCP (#292) 2025-05-07 23:21:05 +00:00			`}) ?? [],`
chore: Sourcebot REST api surface (#290) 2025-05-03 18:33:58 +00:00			`isBranchFilteringEnabled: isBranchFilteringEnabled,`
			`} satisfies SearchResponse));`

			`return parser.parse(searchBody);`
			`}`