mirror of
https://github.com/sourcebot-dev/sourcebot.git
synced 2025-12-12 04:15:30 +00:00
feat(worker): Add ALWAYS_INDEX_FILE_PATTERNS env var to specify files that should always be indexed (#631)
Some checks are pending
Publish to ghcr / build (linux/amd64, blacksmith-4vcpu-ubuntu-2404) (push) Waiting to run
Publish to ghcr / build (linux/arm64, blacksmith-8vcpu-ubuntu-2204-arm) (push) Waiting to run
Publish to ghcr / merge (push) Blocked by required conditions
Update Roadmap Released / update (push) Waiting to run
Some checks are pending
Publish to ghcr / build (linux/amd64, blacksmith-4vcpu-ubuntu-2404) (push) Waiting to run
Publish to ghcr / build (linux/arm64, blacksmith-8vcpu-ubuntu-2204-arm) (push) Waiting to run
Publish to ghcr / merge (push) Blocked by required conditions
Update Roadmap Released / update (push) Waiting to run
This commit is contained in:
parent
c962fdd636
commit
8bc4f1e520
5 changed files with 31 additions and 1 deletions
|
|
@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Added `ALWAYS_INDEX_FILE_PATTERNS` environment variable to allow specifying a comma seperated list of glob patterns matching file paths that should always be indexed, regardless of size or # of trigrams. [#631](https://github.com/sourcebot-dev/sourcebot/pull/631)
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629)
|
- Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ The following environment variables allow you to configure your Sourcebot deploy
|
||||||
| `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - | <p>Optional file to log to if structured logging is enabled</p> |
|
| `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - | <p>Optional file to log to if structured logging is enabled</p> |
|
||||||
| `SOURCEBOT_TELEMETRY_DISABLED` | `false` | <p>Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.</p> |
|
| `SOURCEBOT_TELEMETRY_DISABLED` | `false` | <p>Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.</p> |
|
||||||
| `DEFAULT_MAX_MATCH_COUNT` | `10000` | <p>The default maximum number of search results to return when using search in the web app.</p> |
|
| `DEFAULT_MAX_MATCH_COUNT` | `10000` | <p>The default maximum number of search results to return when using search in the web app.</p> |
|
||||||
|
| `ALWAYS_INDEX_FILE_PATTERNS` | - | <p>A comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams.</p> |
|
||||||
|
|
||||||
### Enterprise Environment Variables
|
### Enterprise Environment Variables
|
||||||
| Variable | Default | Description |
|
| Variable | Default | Description |
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,26 @@ To learn more about how to create a connection for a specific code host, check o
|
||||||
|
|
||||||
<Note>Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md).</Note>
|
<Note>Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md).</Note>
|
||||||
|
|
||||||
|
## Indexing Large Files
|
||||||
|
|
||||||
|
By default, Sourcebot will skip indexing files that are larger than 2MB or have more than 20,000 trigrams. You can configure this by setting the `maxFileSize` and `maxTrigramCount` [settings](/docs/configuration/config-file#settings).
|
||||||
|
|
||||||
|
These limits can be ignored for specific files by passing in a comma separated list of glob patterns matching file paths to the `ALWAYS_INDEX_FILE_PATTERNS` environment variable. For example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Always index all .sum and .lock files
|
||||||
|
ALWAYS_INDEX_FILE_PATTERNS=**/*.sum,**/*.lock
|
||||||
|
```
|
||||||
|
|
||||||
|
Files that have been skipped are assigned the `skipped` language. You can view a list of all skipped files by using the following query:
|
||||||
|
```
|
||||||
|
lang:skipped
|
||||||
|
```
|
||||||
|
|
||||||
|
## Indexing Binary Files
|
||||||
|
|
||||||
|
Binary files cannot be indexed by Sourcebot. See [#575](https://github.com/sourcebot-dev/sourcebot/issues/575) for more information.
|
||||||
|
|
||||||
|
|
||||||
## Schema reference
|
## Schema reference
|
||||||
---
|
---
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import { Repo } from "@sourcebot/db";
|
import { Repo } from "@sourcebot/db";
|
||||||
import { createLogger } from "@sourcebot/shared";
|
import { createLogger, env } from "@sourcebot/shared";
|
||||||
import { exec } from "child_process";
|
import { exec } from "child_process";
|
||||||
import { INDEX_CACHE_DIR } from "./constants.js";
|
import { INDEX_CACHE_DIR } from "./constants.js";
|
||||||
import { Settings } from "./types.js";
|
import { Settings } from "./types.js";
|
||||||
|
|
@ -11,6 +11,8 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
|
||||||
const { path: repoPath } = getRepoPath(repo);
|
const { path: repoPath } = getRepoPath(repo);
|
||||||
const shardPrefix = getShardPrefix(repo.orgId, repo.id);
|
const shardPrefix = getShardPrefix(repo.orgId, repo.id);
|
||||||
|
|
||||||
|
const largeFileGlobPatterns = env.ALWAYS_INDEX_FILE_PATTERNS?.split(',').map(pattern => pattern.trim()) ?? [];
|
||||||
|
|
||||||
const command = [
|
const command = [
|
||||||
'zoekt-git-index',
|
'zoekt-git-index',
|
||||||
'-allow_missing_branches',
|
'-allow_missing_branches',
|
||||||
|
|
@ -21,6 +23,7 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
|
||||||
`-tenant_id ${repo.orgId}`,
|
`-tenant_id ${repo.orgId}`,
|
||||||
`-repo_id ${repo.id}`,
|
`-repo_id ${repo.id}`,
|
||||||
`-shard_prefix ${shardPrefix}`,
|
`-shard_prefix ${shardPrefix}`,
|
||||||
|
...largeFileGlobPatterns.map((pattern) => `-large_file ${pattern}`),
|
||||||
repoPath
|
repoPath
|
||||||
].join(' ');
|
].join(' ');
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -219,6 +219,9 @@ export const env = createEnv({
|
||||||
|
|
||||||
// Configure the default maximum number of search results to return by default.
|
// Configure the default maximum number of search results to return by default.
|
||||||
DEFAULT_MAX_MATCH_COUNT: numberSchema.default(10_000),
|
DEFAULT_MAX_MATCH_COUNT: numberSchema.default(10_000),
|
||||||
|
|
||||||
|
// A comma separated list of glob patterns that shwould always be indexed regardless of their size.
|
||||||
|
ALWAYS_INDEX_FILE_PATTERNS: z.string().optional(),
|
||||||
},
|
},
|
||||||
runtimeEnv,
|
runtimeEnv,
|
||||||
emptyStringAsUndefined: true,
|
emptyStringAsUndefined: true,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue