diff --git a/.env.development b/.env.development index 0309b5fb..1740c2df 100644 --- a/.env.development +++ b/.env.development @@ -4,8 +4,6 @@ DATABASE_URL="postgresql://postgres:postgres@localhost:5432/postgres" # Zoekt ZOEKT_WEBSERVER_URL="http://localhost:6070" -# SHARD_MAX_MATCH_COUNT=10000 -# TOTAL_MAX_MATCH_COUNT=100000 # The command to use for generating ctags. CTAGS_COMMAND=ctags # logging, strict diff --git a/.github/workflows/deploy-demo.yml b/.github/workflows/deploy-demo.yml index 71bc9ef0..50884bef 100644 --- a/.github/workflows/deploy-demo.yml +++ b/.github/workflows/deploy-demo.yml @@ -2,7 +2,7 @@ name: Deploy Demo on: push: - branches: ["main"] + tags: ["v*.*.*"] workflow_dispatch: jobs: diff --git a/.github/workflows/ghcr-publish.yml b/.github/workflows/ghcr-publish.yml index 6e443966..cf96bea7 100644 --- a/.github/workflows/ghcr-publish.yml +++ b/.github/workflows/ghcr-publish.yml @@ -27,9 +27,9 @@ jobs: platform: [linux/amd64, linux/arm64] include: - platform: linux/amd64 - runs-on: ubuntu-latest + runs-on: blacksmith-4vcpu-ubuntu-2404 - platform: linux/arm64 - runs-on: ubuntu-24.04-arm + runs-on: blacksmith-8vcpu-ubuntu-2204-arm steps: - name: Prepare @@ -57,8 +57,8 @@ jobs: with: cosign-release: "v2.2.4" - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + - name: Setup Blacksmith Builder + uses: useblacksmith/setup-docker-builder@v1 - name: Login to GitHub Packages Docker Registry uses: docker/login-action@v3 @@ -69,12 +69,10 @@ jobs: - name: Build Docker image id: build - uses: docker/build-push-action@v6 + uses: useblacksmith/build-push-action@v2 with: context: . 
labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha,scope=${{ env.PLATFORM_PAIR }} - cache-to: type=gha,mode=max,scope=${{ env.PLATFORM_PAIR }} platforms: ${{ matrix.platform }} outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true,annotation.org.opencontainers.image.description=Blazingly fast code search build-args: | @@ -110,7 +108,7 @@ jobs: run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} merge: - runs-on: ubuntu-latest + runs-on: blacksmith-4vcpu-ubuntu-2404 permissions: packages: write needs: @@ -123,8 +121,8 @@ jobs: pattern: digests-* merge-multiple: true - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + - name: Setup Blacksmith Builder + uses: useblacksmith/setup-docker-builder@v1 - name: Extract Docker metadata id: meta diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml index de59790f..12750446 100644 --- a/.github/workflows/pr-gate.yml +++ b/.github/workflows/pr-gate.yml @@ -8,7 +8,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: blacksmith-4vcpu-ubuntu-2404 permissions: contents: read steps: @@ -19,6 +19,6 @@ jobs: - name: Build Docker image id: build - uses: docker/build-push-action@v6 + uses: useblacksmith/build-push-action@v2 with: context: . 
diff --git a/.github/workflows/test-backend.yml b/.github/workflows/test-backend.yml index 5d4b5fde..37ea3e77 100644 --- a/.github/workflows/test-backend.yml +++ b/.github/workflows/test-backend.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: blacksmith-4vcpu-ubuntu-2404 permissions: contents: read steps: diff --git a/.github/workflows/test-web.yml b/.github/workflows/test-web.yml index 108cc277..006ba0e5 100644 --- a/.github/workflows/test-web.yml +++ b/.github/workflows/test-web.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: blacksmith-4vcpu-ubuntu-2404 permissions: contents: read steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 89530b9e..82b854fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] + + +### Fixed +- Fixed "dubious ownership" errors when cloning / fetching repos. [#553](https://github.com/sourcebot-dev/sourcebot/pull/553) + +### Changed +- Remove spam "login page loaded" log. [#552](https://github.com/sourcebot-dev/sourcebot/pull/552) +- Improved search performance for unbounded search queries. [#555](https://github.com/sourcebot-dev/sourcebot/pull/555) + ### Added - Added support for passing db connection url as seperate `DATABASE_HOST`, `DATABASE_USERNAME`, `DATABASE_PASSWORD`, `DATABASE_NAME`, and `DATABASE_ARGS` env vars. [#545](https://github.com/sourcebot-dev/sourcebot/pull/545) diff --git a/Dockerfile b/Dockerfile index 010f5940..95a8f825 100644 --- a/Dockerfile +++ b/Dockerfile @@ -233,6 +233,9 @@ COPY --from=shared-libs-builder /app/packages/shared ./packages/shared # Configure dependencies RUN apk add --no-cache git ca-certificates bind-tools tini jansson wget supervisor uuidgen curl perl jq redis postgresql postgresql-contrib openssl util-linux unzip +# Fixes git "dubious ownership" issues when the volume is mounted with different permissions to the container. 
+RUN git config --global safe.directory "*" + # Configure the database RUN mkdir -p /run/postgresql && \ chown -R postgres:postgres /run/postgresql && \ diff --git a/docs/docs/configuration/environment-variables.mdx b/docs/docs/configuration/environment-variables.mdx index d49073fd..a51aeb37 100644 --- a/docs/docs/configuration/environment-variables.mdx +++ b/docs/docs/configuration/environment-variables.mdx @@ -28,7 +28,6 @@ The following environment variables allow you to configure your Sourcebot deploy | `REDIS_REMOVE_ON_FAIL` | `100` |
Controls how many failed jobs are allowed to remain in Redis queues
| | `REPO_SYNC_RETRY_BASE_SLEEP_SECONDS` | `60` |The base sleep duration (in seconds) for exponential backoff when retrying repository sync operations that fail
| | `GITLAB_CLIENT_QUERY_TIMEOUT_SECONDS` | `600` |The timeout duration (in seconds) for GitLab client queries
| -| `SHARD_MAX_MATCH_COUNT` | `10000` |The maximum shard count per query
| | `SMTP_CONNECTION_URL` | `-` |The url to the SMTP service used for sending transactional emails. See [this doc](/docs/configuration/transactional-emails) for more info.
| | `SOURCEBOT_ENCRYPTION_KEY` | Automatically generated at startup if no value is provided. Generated using `openssl rand -base64 24` |Used to encrypt connection secrets and generate API keys.
| | `SOURCEBOT_PUBLIC_KEY_PATH` | `/app/public.pem` |Sourcebot's public key that's used to verify encrypted license key signatures.
| @@ -36,8 +35,6 @@ The following environment variables allow you to configure your Sourcebot deploy | `SOURCEBOT_STRUCTURED_LOGGING_ENABLED` | `false` |Enables/disable structured JSON logging. See [this doc](/docs/configuration/structured-logging) for more info.
| | `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - |Optional file to log to if structured logging is enabled
| | `SOURCEBOT_TELEMETRY_DISABLED` | `false` |Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.
| -| `TOTAL_MAX_MATCH_COUNT` | `100000` |The maximum number of matches per query
| -| `ZOEKT_MAX_WALL_TIME_MS` | `10000` |The maximum real world duration (in milliseconds) per zoekt query
| ### Enterprise Environment Variables | Variable | Default | Description | diff --git a/docs/docs/overview.mdx b/docs/docs/overview.mdx index 96d9bc85..15a42dcb 100644 --- a/docs/docs/overview.mdx +++ b/docs/docs/overview.mdx @@ -2,10 +2,11 @@ title: "Overview" --- -[Sourcebot](https://github.com/sourcebot-dev/sourcebot) is a self-hosted tool that helps you understand your codebase. +[Sourcebot](https://github.com/sourcebot-dev/sourcebot) is a platform that helps humans and agents understand your codebase: - [Code search](/docs/features/search/overview): Search and navigate across all your repos and branches, no matter where they’re hosted - [Ask Sourcebot](/docs/features/ask): Ask questions about your codebase and have Sourcebot provide detailed answers grounded with inline citations +- [MCP](/docs/features/mcp-server): Enrich agent context windows with code across your organizationSearching...
Failed to search
+{error.message}
+Search stats for nerds
+{`[${searchDurationMs} ms] Found ${numMatches} matches in ${fileMatches.length} ${fileMatches.length > 1 ? 'files' : 'file'}`}
diff --git a/packages/web/src/app/components/codeSnippet.tsx b/packages/web/src/app/components/codeSnippet.tsx index 93ca4de3..e77a24b7 100644 --- a/packages/web/src/app/components/codeSnippet.tsx +++ b/packages/web/src/app/components/codeSnippet.tsx @@ -1,12 +1,12 @@ import { cn } from "@/lib/utils" -export const CodeSnippet = ({ children, className, title }: { children: React.ReactNode, className?: string, title?: string }) => { +export const CodeSnippet = ({ children, className, title, renderNewlines = false }: { children: React.ReactNode, className?: string, title?: string, renderNewlines?: boolean }) => { return (
- {children}
+ {renderNewlines ? {children} : children}
)
}
\ No newline at end of file
diff --git a/packages/web/src/app/login/page.tsx b/packages/web/src/app/login/page.tsx
index f83ac939..1535ec58 100644
--- a/packages/web/src/app/login/page.tsx
+++ b/packages/web/src/app/login/page.tsx
@@ -2,13 +2,10 @@ import { auth } from "@/auth";
import { LoginForm } from "./components/loginForm";
import { redirect } from "next/navigation";
import { Footer } from "@/app/components/footer";
-import { createLogger } from "@sourcebot/logger";
import { getAuthProviders } from "@/lib/authProviders";
import { getOrgFromDomain } from "@/data/org";
import { SINGLE_TENANT_ORG_DOMAIN } from "@/lib/constants";
-const logger = createLogger('login-page');
-
interface LoginProps {
searchParams: Promise<{
callbackUrl?: string;
@@ -18,10 +15,8 @@ interface LoginProps {
export default async function Login(props: LoginProps) {
const searchParams = await props.searchParams;
- logger.info("Login page loaded");
const session = await auth();
if (session) {
- logger.info("Session found in login page, redirecting to home");
return redirect("/");
}
diff --git a/packages/web/src/env.mjs b/packages/web/src/env.mjs
index 922b2b84..7a9c1589 100644
--- a/packages/web/src/env.mjs
+++ b/packages/web/src/env.mjs
@@ -15,9 +15,6 @@ export const env = createEnv({
server: {
// Zoekt
ZOEKT_WEBSERVER_URL: z.string().url().default("http://localhost:6070"),
- SHARD_MAX_MATCH_COUNT: numberSchema.default(10000),
- TOTAL_MAX_MATCH_COUNT: numberSchema.default(100000),
- ZOEKT_MAX_WALL_TIME_MS: numberSchema.default(10000),
// Auth
FORCE_ENABLE_ANONYMOUS_ACCESS: booleanSchema.default('false'),
diff --git a/packages/web/src/features/codeNav/actions.ts b/packages/web/src/features/codeNav/actions.ts
index b55cfa30..839ef381 100644
--- a/packages/web/src/features/codeNav/actions.ts
+++ b/packages/web/src/features/codeNav/actions.ts
@@ -80,7 +80,7 @@ export const findSearchBasedSymbolDefinitions = async (
const parseRelatedSymbolsSearchResponse = (searchResult: SearchResponse) => {
const parser = searchResponseSchema.transform(async ({ files }) => ({
stats: {
- matchCount: searchResult.stats.matchCount,
+ matchCount: searchResult.stats.actualMatchCount,
},
files: files.flatMap((file) => {
const chunks = file.chunks;
diff --git a/packages/web/src/features/search/schemas.ts b/packages/web/src/features/search/schemas.ts
index 18dfd8d4..1867d849 100644
--- a/packages/web/src/features/search/schemas.ts
+++ b/packages/web/src/features/search/schemas.ts
@@ -37,35 +37,82 @@ export const repositoryInfoSchema = z.object({
name: z.string(),
displayName: z.string().optional(),
webUrl: z.string().optional(),
-})
+});
+
+// Many of these fields are defined in zoekt/api.go.
+export const searchStatsSchema = z.object({
+ // The actual number of matches returned by the search.
+ // This will always be less than or equal to `totalMatchCount`.
+ actualMatchCount: z.number(),
+
+ // The total number of matches found during the search.
+ totalMatchCount: z.number(),
+
+ // The duration (in nanoseconds) of the search.
+ duration: z.number(),
+
+ // Number of files containing a match.
+ fileCount: z.number(),
+
+ // Candidate files whose contents weren't examined because we
+ // gathered enough matches.
+ filesSkipped: z.number(),
+
+ // Amount of I/O for reading contents.
+ contentBytesLoaded: z.number(),
+
+ // Amount of I/O for reading from index.
+ indexBytesLoaded: z.number(),
+
+ // Number of search shards that had a crash.
+ crashes: z.number(),
+
+ // Number of files in shards that we considered.
+ shardFilesConsidered: z.number(),
+
+ // Files that we evaluated. Equivalent to files for which all
+ // atom matches (including negations) evaluated to true.
+ filesConsidered: z.number(),
+
+ // Files for which we loaded file content to verify substring matches
+ filesLoaded: z.number(),
+
+ // Shards that we scanned to find matches.
+ shardsScanned: z.number(),
+
+ // Shards that we did not process because a query was canceled.
+ shardsSkipped: z.number(),
+
+ // Shards that we did not process because the query was rejected by the
+ // ngram filter indicating it had no matches.
+ shardsSkippedFilter: z.number(),
+
+ // Number of candidate matches as a result of searching ngrams.
+ ngramMatches: z.number(),
+
+ // NgramLookups is the number of times we accessed an ngram in the index.
+ ngramLookups: z.number(),
+
+ // Wall clock time for queued search.
+ wait: z.number(),
+
+ // Aggregate wall clock time spent constructing and pruning the match tree.
+ // This accounts for time such as lookups in the trigram index.
+ matchTreeConstruction: z.number(),
+
+ // Aggregate wall clock time spent searching the match tree. This accounts
+ // for the bulk of search work done looking for matches.
+ matchTreeSearch: z.number(),
+
+ // Number of times regexp was called on files that we evaluated.
+ regexpsConsidered: z.number(),
+
+ // FlushReason explains why results were flushed.
+ flushReason: z.number(),
+});
export const searchResponseSchema = z.object({
- zoektStats: z.object({
- // The duration (in nanoseconds) of the search.
- duration: z.number(),
- fileCount: z.number(),
- matchCount: z.number(),
- filesSkipped: z.number(),
- contentBytesLoaded: z.number(),
- indexBytesLoaded: z.number(),
- crashes: z.number(),
- shardFilesConsidered: z.number(),
- filesConsidered: z.number(),
- filesLoaded: z.number(),
- shardsScanned: z.number(),
- shardsSkipped: z.number(),
- shardsSkippedFilter: z.number(),
- ngramMatches: z.number(),
- ngramLookups: z.number(),
- wait: z.number(),
- matchTreeConstruction: z.number(),
- matchTreeSearch: z.number(),
- regexpsConsidered: z.number(),
- flushReason: z.number(),
- }),
- stats: z.object({
- matchCount: z.number(),
- }),
+ stats: searchStatsSchema,
files: z.array(z.object({
fileName: z.object({
// The name of the file
@@ -92,6 +139,7 @@ export const searchResponseSchema = z.object({
})),
repositoryInfo: z.array(repositoryInfoSchema),
isBranchFilteringEnabled: z.boolean(),
+ isSearchExhaustive: z.boolean(),
});
export const fileSourceRequestSchema = z.object({
diff --git a/packages/web/src/features/search/searchApi.ts b/packages/web/src/features/search/searchApi.ts
index 60d04dec..3c7ea373 100644
--- a/packages/web/src/features/search/searchApi.ts
+++ b/packages/web/src/features/search/searchApi.ts
@@ -151,12 +151,48 @@ export const search = async ({ query, matches, contextLines, whole }: SearchRequ
// @see: https://github.com/sourcebot-dev/zoekt/blob/main/api.go#L892
opts: {
ChunkMatches: true,
+ // @note: Zoekt has several different ways to limit a given search. The two that
+ // we care about are `MaxMatchDisplayCount` and `TotalMaxMatchCount`:
+ // - `MaxMatchDisplayCount` truncates the number of matches AFTER performing
+ // a search (specifically, after collating and sorting the results). The number of
+ // results returned by the API will be less than or equal to this value.
+ //
+ // - `TotalMaxMatchCount` truncates the number of matches DURING a search. The results
+                //      returned by the API can be less than, equal to, or greater than this value.
+ // Why greater? Because this value is compared _after_ a given shard has finished
+ // being processed, the number of matches returned by the last shard may have exceeded
+ // this value.
+ //
+ // Let's define two variables:
+ // - `actualMatchCount` : The number of matches that are returned by the API. This is
+ // always less than or equal to `MaxMatchDisplayCount`.
+ // - `totalMatchCount` : The number of matches that zoekt found before it either
+ // 1) found all matches or 2) hit the `TotalMaxMatchCount` limit. This number is
+ // not bounded and can be less than, equal to, or greater than both `TotalMaxMatchCount`
+ // and `MaxMatchDisplayCount`.
+ //
+ //
+                // Our challenge is to determine whether or not the search returned all possible matches
+                // (it was exhaustive) or if it was truncated. By setting the `TotalMaxMatchCount` to
+ // `MaxMatchDisplayCount + 1`, we can determine which of these occurred by comparing
+                // `totalMatchCount` to `actualMatchCount` (which is bounded by `MaxMatchDisplayCount`).
+ //
+ // if (totalMatchCount ≤ actualMatchCount):
+ // Search is EXHAUSTIVE (found all possible matches)
+ // Proof: totalMatchCount ≤ MaxMatchDisplayCount < TotalMaxMatchCount
+ // Therefore Zoekt stopped naturally, not due to limit
+ //
+ // if (totalMatchCount > actualMatchCount):
+ // Search is TRUNCATED (more matches exist)
+                //     Proof: totalMatchCount ≥ MaxMatchDisplayCount + 1 = TotalMaxMatchCount
+ // Therefore Zoekt hit the limit and stopped searching
+ //
MaxMatchDisplayCount: matches,
+ TotalMaxMatchCount: matches + 1,
NumContextLines: contextLines,
Whole: !!whole,
- TotalMaxMatchCount: env.TOTAL_MAX_MATCH_COUNT,
- ShardMaxMatchCount: env.SHARD_MAX_MATCH_COUNT,
- MaxWallTime: env.ZOEKT_MAX_WALL_TIME_MS * 1000 * 1000, // zoekt expects a duration in nanoseconds
+ ShardMaxMatchCount: -1,
+ MaxWallTime: 0, // zoekt expects a duration in nanoseconds
}
});
@@ -296,11 +332,35 @@ export const search = async ({ query, matches, contextLines, whole }: SearchRequ
}
}).filter((file) => file !== undefined) ?? [];
+ const actualMatchCount = files.reduce(
+ (acc, file) =>
+ // Match count is the sum of the number of chunk matches and file name matches.
+ acc + file.chunks.reduce(
+ (acc, chunk) => acc + chunk.matchRanges.length,
+ 0,
+ ) + file.fileName.matchRanges.length,
+ 0,
+ );
+
+ const totalMatchCount = Result.MatchCount;
+ const isSearchExhaustive = totalMatchCount <= actualMatchCount;
+
return {
- zoektStats: {
+ files,
+ repositoryInfo: Array.from(repos.values()).map((repo) => ({
+ id: repo.id,
+ codeHostType: repo.external_codeHostType,
+ name: repo.name,
+ displayName: repo.displayName ?? undefined,
+ webUrl: repo.webUrl ?? undefined,
+ })),
+ isBranchFilteringEnabled,
+ isSearchExhaustive,
+ stats: {
+ actualMatchCount,
+ totalMatchCount,
duration: Result.Duration,
fileCount: Result.FileCount,
- matchCount: Result.MatchCount,
filesSkipped: Result.FilesSkipped,
contentBytesLoaded: Result.ContentBytesLoaded,
indexBytesLoaded: Result.IndexBytesLoaded,
@@ -318,25 +378,6 @@ export const search = async ({ query, matches, contextLines, whole }: SearchRequ
matchTreeSearch: Result.MatchTreeSearch,
regexpsConsidered: Result.RegexpsConsidered,
flushReason: Result.FlushReason,
- },
- files,
- repositoryInfo: Array.from(repos.values()).map((repo) => ({
- id: repo.id,
- codeHostType: repo.external_codeHostType,
- name: repo.name,
- displayName: repo.displayName ?? undefined,
- webUrl: repo.webUrl ?? undefined,
- })),
- isBranchFilteringEnabled: isBranchFilteringEnabled,
- stats: {
- matchCount: files.reduce(
- (acc, file) =>
- acc + file.chunks.reduce(
- (acc, chunk) => acc + chunk.matchRanges.length,
- 0,
- ),
- 0,
- )
}
} satisfies SearchResponse;
});
diff --git a/packages/web/src/features/search/types.ts b/packages/web/src/features/search/types.ts
index f9af8dbe..2a238857 100644
--- a/packages/web/src/features/search/types.ts
+++ b/packages/web/src/features/search/types.ts
@@ -8,6 +8,7 @@ import {
fileSourceRequestSchema,
symbolSchema,
repositoryInfoSchema,
+ searchStatsSchema,
} from "./schemas";
import { z } from "zod";
@@ -22,4 +23,5 @@ export type FileSourceRequest = z.infer