feat(worker): Add ALWAYS_INDEX_FILE_PATTERNS env var to specify files that should always be indexed (#631)
Some checks are pending
Publish to ghcr / build (linux/amd64, blacksmith-4vcpu-ubuntu-2404) (push) Waiting to run
Publish to ghcr / build (linux/arm64, blacksmith-8vcpu-ubuntu-2204-arm) (push) Waiting to run
Publish to ghcr / merge (push) Blocked by required conditions
Update Roadmap Released / update (push) Waiting to run

This commit is contained in:
Brendan Kellam 2025-11-25 23:38:30 -08:00 committed by GitHub
parent c962fdd636
commit 8bc4f1e520
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 31 additions and 1 deletions

View file

@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Added `ALWAYS_INDEX_FILE_PATTERNS` environment variable to allow specifying a comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams. [#631](https://github.com/sourcebot-dev/sourcebot/pull/631)
### Fixed
- Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629)

View file

@ -35,6 +35,7 @@ The following environment variables allow you to configure your Sourcebot deploy
| `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - | <p>Optional file to log to if structured logging is enabled</p> |
| `SOURCEBOT_TELEMETRY_DISABLED` | `false` | <p>Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.</p> |
| `DEFAULT_MAX_MATCH_COUNT` | `10000` | <p>The default maximum number of search results to return when using search in the web app.</p> |
| `ALWAYS_INDEX_FILE_PATTERNS` | - | <p>A comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams.</p> |
### Enterprise Environment Variables
| Variable | Default | Description |

View file

@ -69,6 +69,26 @@ To learn more about how to create a connection for a specific code host, check o
<Note>Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md).</Note>
## Indexing Large Files
By default, Sourcebot will skip indexing files that are larger than 2MB or have more than 20,000 trigrams. You can configure this by setting the `maxFileSize` and `maxTrigramCount` [settings](/docs/configuration/config-file#settings).
These limits can be ignored for specific files by passing in a comma separated list of glob patterns matching file paths to the `ALWAYS_INDEX_FILE_PATTERNS` environment variable. For example:
```bash
# Always index all .sum and .lock files
ALWAYS_INDEX_FILE_PATTERNS=**/*.sum,**/*.lock
```
Files that have been skipped are assigned the `skipped` language. You can view a list of all skipped files by using the following query:
```
lang:skipped
```
## Indexing Binary Files
Binary files cannot be indexed by Sourcebot. See [#575](https://github.com/sourcebot-dev/sourcebot/issues/575) for more information.
## Schema reference
---

View file

@ -1,5 +1,5 @@
import { Repo } from "@sourcebot/db";
import { createLogger } from "@sourcebot/shared";
import { createLogger, env } from "@sourcebot/shared";
import { exec } from "child_process";
import { INDEX_CACHE_DIR } from "./constants.js";
import { Settings } from "./types.js";
@ -11,6 +11,8 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
const { path: repoPath } = getRepoPath(repo);
const shardPrefix = getShardPrefix(repo.orgId, repo.id);
const largeFileGlobPatterns = env.ALWAYS_INDEX_FILE_PATTERNS?.split(',').map(pattern => pattern.trim()) ?? [];
const command = [
'zoekt-git-index',
'-allow_missing_branches',
@ -21,6 +23,7 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
`-tenant_id ${repo.orgId}`,
`-repo_id ${repo.id}`,
`-shard_prefix ${shardPrefix}`,
...largeFileGlobPatterns.map((pattern) => `-large_file ${pattern}`),
repoPath
].join(' ');

View file

@ -219,6 +219,9 @@ export const env = createEnv({
// Configure the default maximum number of search results to return.
DEFAULT_MAX_MATCH_COUNT: numberSchema.default(10_000),
// A comma separated list of glob patterns that should always be indexed regardless of their size.
ALWAYS_INDEX_FILE_PATTERNS: z.string().optional(),
},
runtimeEnv,
emptyStringAsUndefined: true,