'use server';

import { env } from "@/env.mjs";
import { invalidZoektResponse, ServiceError } from "../../lib/serviceError";
import { isServiceError } from "../../lib/utils";
import { zoektFetch } from "./zoektClient";
import { ErrorCode } from "../../lib/errorCodes";
import { StatusCodes } from "http-status-codes";
import { zoektSearchResponseSchema } from "./zoektSchema";
import { SearchRequest, SearchResponse, SourceRange } from "./types";
import { PrismaClient, Repo } from "@sourcebot/db";
import { sew } from "@/actions";
import { base64Decode } from "@sourcebot/shared";
import { withOptionalAuthV2 } from "@/withAuthV2";

// List of supported query prefixes in zoekt.
// @see: https://github.com/sourcebot-dev/zoekt/blob/main/query/parse.go#L417
enum zoektPrefixes {
    archived = "archived:",
    branchShort = "b:",
    branch = "branch:",
    caseShort = "c:",
    case = "case:",
    content = "content:",
    fileShort = "f:",
    file = "file:",
    fork = "fork:",
    public = "public:",
    repoShort = "r:",
    repo = "repo:",
    regex = "regex:",
    lang = "lang:",
    sym = "sym:",
    typeShort = "t:",
    type = "type:",
    reposet = "reposet:",
}

/**
 * Rewrites the query atoms that zoekt does not understand natively into
 * their zoekt equivalents. The query is split on single spaces and each part
 * is handled independently:
 *
 * - `rev:<name>` / `revision:<name>` become `branch:<name>` (`*` becomes an
 *   empty branch name).
 * - `context:<name>` becomes a `reposet:` atom listing the names of the repos
 *   attached to that search context in the given org.
 * - Anything else is passed through unchanged.
 *
 * A leading `-` (negation) is preserved on rewritten parts.
 *
 * @param query The raw query string.
 * @param orgId The org used to look up search contexts.
 * @param prisma The database client used for the search context lookup.
 * @returns The rewritten query, or a `ServiceError` when a referenced search
 * context does not exist.
 */
const transformZoektQuery = async (query: string, orgId: number, prisma: PrismaClient): Promise<string | ServiceError> => {
    const prevQueryParts = query.split(" ");
    const newQueryParts: string[] = [];

    for (const part of prevQueryParts) {
        // Handle mapping `rev:` and `revision:` to `branch:`
        if (part.match(/^-?(rev|revision):.+$/)) {
            const isNegated = part.startsWith("-");
            let revisionName = part.slice(part.indexOf(":") + 1);

            // Special case: `*` -> search all revisions.
            // In zoekt, providing a blank string will match all branches.
            // @see: https://github.com/sourcebot-dev/zoekt/blob/main/eval.go#L560-L562
            if (revisionName === "*") {
                revisionName = "";
            }

            newQueryParts.push(`${isNegated ? "-" : ""}${zoektPrefixes.branch}${revisionName}`);
        }
        // Expand `context:` into `reposet:` atom.
        else if (part.match(/^-?context:.+$/)) {
            const isNegated = part.startsWith("-");
            const contextName = part.slice(part.indexOf(":") + 1);

            const context = await prisma.searchContext.findUnique({
                where: {
                    name_orgId: {
                        name: contextName,
                        orgId,
                    }
                },
                include: {
                    repos: true,
                }
            });

            // If the context doesn't exist, return an error.
            if (!context) {
                return {
                    errorCode: ErrorCode.SEARCH_CONTEXT_NOT_FOUND,
                    message: `Search context "${contextName}" not found`,
                    statusCode: StatusCodes.NOT_FOUND,
                } satisfies ServiceError;
            }

            const names = context.repos.map((repo) => repo.name);
            newQueryParts.push(`${isNegated ? "-" : ""}${zoektPrefixes.reposet}${names.join(",")}`);
        }
        // no-op: add the original part to the new query parts.
        else {
            newQueryParts.push(part);
        }
    }

    return newQueryParts.join(" ");
}

/**
 * Builds a repository file URL from a zoekt URL template, a branch, and a
 * file name.
 *
 * @param template The zoekt file URL template for the repository.
 * @param branch The value substituted for `.Version` / `{{.Version}}`.
 * @param fileName The value substituted for `.Path` / `{{.Path}}`.
 * @returns The URL passed through `encodeURI`, or `undefined` when the
 * template is not of the supported `{{URLJoinPath ...}}[?query]` form.
 */
const getFileWebUrl = (template: string, branch: string, fileName: string): string | undefined => {
    // This is a hacky parser for templates generated by
    // the go text/template package. Example template:
    // {{URLJoinPath "https://github.com/sourcebot-dev/sourcebot" "blob" .Version .Path}}
    if (!template.match(/^{{URLJoinPath\s.*}}(\?.+)?$/)) {
        return undefined;
    }

    const templateEnd = template.indexOf("}}");

    const url = template.substring("{{URLJoinPath ".length, templateEnd)
        .split(" ")
        .map((part) => {
            // remove wrapping quotes
            if (part.startsWith("\"")) part = part.substring(1);
            if (part.endsWith("\"")) part = part.substring(0, part.length - 1);

            // Replace variable references
            if (part === ".Version") part = branch;
            if (part === ".Path") part = fileName;

            return part;
        })
        .join("/");

    // Function replacers are used so that `$` sequences in the branch or file
    // name (e.g. `$&`, `$$`) are inserted literally instead of being
    // interpreted as replacement patterns by String.prototype.replace.
    const optionalQueryParams = template.substring(templateEnd + 2)
        .replace("{{.Version}}", () => branch)
        .replace("{{.Path}}", () => fileName);

    return encodeURI(url + optionalQueryParams);
}

// Shape of a position as read from the zoekt search response.
// NOTE(review): fields are assumed to be numbers — confirm against zoektSchema.
type ZoektLocation = {
    ByteOffset: number;
    Column: number;
    LineNumber: number;
};

// Converts a zoekt position into the camelCase position used by `SourceRange`.
const toSourceLocation = (location: ZoektLocation): SourceRange["start"] => ({
    byteOffset: location.ByteOffset,
    column: location.Column,
    lineNumber: location.LineNumber,
});

// Converts a zoekt match range into a `SourceRange`.
const toSourceRange = (range: { Start: ZoektLocation; End: ZoektLocation }): SourceRange => ({
    start: toSourceLocation(range.Start),
    end: toSourceLocation(range.End),
});

/**
 * Runs a search against zoekt and maps the result into a `SearchResponse`.
 *
 * The query is first rewritten with `transformZoektQuery`. When it contains
 * no branch filter, ` branch:HEAD` is appended. The request is POSTed to
 * zoekt's `/api/search` with the org id in the `X-Tenant-ID` header. Files
 * whose repository cannot be found in the org's `Repo` table are dropped
 * from the response.
 *
 * @returns The result of `sew(...)` wrapping either the parsed search
 * response or a service error (unknown search context, or a non-OK zoekt
 * response).
 */
export const search = async ({ query, matches, contextLines, whole }: SearchRequest) => sew(() =>
    withOptionalAuthV2(async ({ org, prisma }) => {
        const transformedQuery = await transformZoektQuery(query, org.id, prisma);
        if (isServiceError(transformedQuery)) {
            return transformedQuery;
        }

        let zoektQuery = transformedQuery;

        // NOTE(review): this is a plain substring check, so any query text that
        // happens to contain `branch:` or `b:` also counts as a branch filter.
        const isBranchFilteringEnabled = (
            zoektQuery.includes(zoektPrefixes.branch) ||
            zoektQuery.includes(zoektPrefixes.branchShort)
        );

        // We only want to show matches for the default branch when
        // the user isn't explicitly filtering by branch.
        if (!isBranchFilteringEnabled) {
            zoektQuery = zoektQuery.concat(` branch:HEAD`);
        }

        const body = JSON.stringify({
            q: zoektQuery,
            // @see: https://github.com/sourcebot-dev/zoekt/blob/main/api.go#L892
            opts: {
                ChunkMatches: true,
                MaxMatchDisplayCount: matches,
                NumContextLines: contextLines,
                Whole: !!whole,
                TotalMaxMatchCount: env.TOTAL_MAX_MATCH_COUNT,
                ShardMaxMatchCount: env.SHARD_MAX_MATCH_COUNT,
                MaxWallTime: env.ZOEKT_MAX_WALL_TIME_MS * 1000 * 1000, // zoekt expects a duration in nanoseconds
            }
        });

        const header: Record<string, string> = {
            "X-Tenant-ID": org.id.toString()
        };

        const searchResponse = await zoektFetch({
            path: "/api/search",
            body,
            header,
            method: "POST",
        });

        if (!searchResponse.ok) {
            return invalidZoektResponse(searchResponse);
        }

        const searchBody = await searchResponse.json();

        const parser = zoektSearchResponseSchema.transform(async ({ Result }) => {
            // @note (2025-05-12): in zoekt, repositories are identified by the `RepositoryID` field
            // which corresponds to the `id` in the Repo table. In order to efficiently fetch repository
            // metadata when transforming (potentially thousands) of file matches, we aggregate a unique
            // set of repository ids* and map them to their corresponding Repo record.
            //
            // *Q: Why is `RepositoryID` optional? And why are we falling back to `Repository`?
            //  A: Prior to this change, the repository id was not plumbed into zoekt, so RepositoryID was
            //     always undefined. To make this a non-breaking change, we fallback to using the repository's name
            //     (`Repository`) as the identifier in these cases. This is not guaranteed to be unique, but in
            //     practice it is since the repository name includes the host and path (e.g., 'github.com/org/repo',
            //     'gitea.com/org/repo', etc.).
            //
            //     Note: When a repository is re-indexed (every hour) this ID will be populated.
            //     @see: https://github.com/sourcebot-dev/zoekt/pull/6
            const repoIdentifiers = new Set(Result.Files?.map((file) => file.RepositoryID ?? file.Repository) ?? []);
            const repos = new Map<string | number, Repo>();

            // Numeric identifiers are looked up by id...
            (await prisma.repo.findMany({
                where: {
                    id: {
                        in: Array.from(repoIdentifiers).filter((id) => typeof id === "number"),
                    },
                    orgId: org.id,
                }
            })).forEach(repo => repos.set(repo.id, repo));

            // ...and string identifiers by name.
            (await prisma.repo.findMany({
                where: {
                    name: {
                        in: Array.from(repoIdentifiers).filter((id) => typeof id === "string"),
                    },
                    orgId: org.id,
                }
            })).forEach(repo => repos.set(repo.name, repo));

            const files = Result.Files?.map((file) => {
                const fileNameChunks = file.ChunkMatches.filter((chunk) => chunk.FileName);

                const webUrl = (() => {
                    const template: string | undefined = Result.RepoURLs[file.Repository];
                    if (!template) {
                        return undefined;
                    }

                    // If there are multiple branches pointing to the same revision of this file, it doesn't
                    // matter which branch we use here, so use the first one.
                    const branch = file.Branches && file.Branches.length > 0 ? file.Branches[0] : "HEAD";
                    return getFileWebUrl(template, branch, file.FileName);
                })();

                const identifier = file.RepositoryID ?? file.Repository;
                const repo = repos.get(identifier);

                // This can happen if the user doesn't have access to the repository.
                if (!repo) {
                    return undefined;
                }

                return {
                    fileName: {
                        text: file.FileName,
                        // File name match ranges are only reported when there is
                        // exactly one file name chunk.
                        matchRanges: fileNameChunks.length === 1
                            ? fileNameChunks[0].Ranges.map((range) => toSourceRange(range))
                            : [],
                    },
                    repository: repo.name,
                    repositoryId: repo.id,
                    webUrl: webUrl,
                    language: file.Language,
                    chunks: file.ChunkMatches
                        .filter((chunk) => !chunk.FileName) // Filter out filename chunks.
                        .map((chunk) => {
                            return {
                                content: base64Decode(chunk.Content),
                                matchRanges: chunk.Ranges.map((range) => toSourceRange(range)),
                                contentStart: toSourceLocation(chunk.ContentStart),
                                symbols: chunk.SymbolInfo?.map((symbol) => {
                                    return {
                                        symbol: symbol.Sym,
                                        kind: symbol.Kind,
                                        parent: symbol.Parent.length > 0 ? {
                                            symbol: symbol.Parent,
                                            kind: symbol.ParentKind,
                                        } : undefined,
                                    }
                                }) ?? undefined,
                            }
                        }),
                    branches: file.Branches,
                    content: file.Content ? base64Decode(file.Content) : undefined,
                }
            }).filter((file) => file !== undefined) ?? [];

            return {
                zoektStats: {
                    duration: Result.Duration,
                    fileCount: Result.FileCount,
                    matchCount: Result.MatchCount,
                    filesSkipped: Result.FilesSkipped,
                    contentBytesLoaded: Result.ContentBytesLoaded,
                    indexBytesLoaded: Result.IndexBytesLoaded,
                    crashes: Result.Crashes,
                    shardFilesConsidered: Result.ShardFilesConsidered,
                    filesConsidered: Result.FilesConsidered,
                    filesLoaded: Result.FilesLoaded,
                    shardsScanned: Result.ShardsScanned,
                    shardsSkipped: Result.ShardsSkipped,
                    shardsSkippedFilter: Result.ShardsSkippedFilter,
                    ngramMatches: Result.NgramMatches,
                    ngramLookups: Result.NgramLookups,
                    wait: Result.Wait,
                    matchTreeConstruction: Result.MatchTreeConstruction,
                    matchTreeSearch: Result.MatchTreeSearch,
                    regexpsConsidered: Result.RegexpsConsidered,
                    flushReason: Result.FlushReason,
                },
                files,
                repositoryInfo: Array.from(repos.values()).map((repo) => ({
                    id: repo.id,
                    codeHostType: repo.external_codeHostType,
                    name: repo.name,
                    displayName: repo.displayName ?? undefined,
                    webUrl: repo.webUrl ?? undefined,
                })),
                isBranchFilteringEnabled: isBranchFilteringEnabled,
                stats: {
                    // Total number of match ranges across the content chunks
                    // of the returned files.
                    matchCount: files.reduce(
                        (acc, file) =>
                            acc + file.chunks.reduce(
                                (acc, chunk) => acc + chunk.matchRanges.length,
                                0,
                            ),
                        0,
                    )
                }
            } satisfies SearchResponse;
        });

        return parser.parseAsync(searchBody);
    }));