mirror of
https://github.com/sourcebot-dev/sourcebot.git
synced 2025-12-11 20:05:25 +00:00
feat(worker): Add ALWAYS_INDEX_FILE_PATTERNS env var to specify files that should always be indexed (#631)
Some checks are pending
Publish to ghcr / build (linux/amd64, blacksmith-4vcpu-ubuntu-2404) (push) Waiting to run
Publish to ghcr / build (linux/arm64, blacksmith-8vcpu-ubuntu-2204-arm) (push) Waiting to run
Publish to ghcr / merge (push) Blocked by required conditions
Update Roadmap Released / update (push) Waiting to run
Some checks are pending
Publish to ghcr / build (linux/amd64, blacksmith-4vcpu-ubuntu-2404) (push) Waiting to run
Publish to ghcr / build (linux/arm64, blacksmith-8vcpu-ubuntu-2204-arm) (push) Waiting to run
Publish to ghcr / merge (push) Blocked by required conditions
Update Roadmap Released / update (push) Waiting to run
This commit is contained in:
parent
c962fdd636
commit
8bc4f1e520
5 changed files with 31 additions and 1 deletions
|
|
@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Added `ALWAYS_INDEX_FILE_PATTERNS` environment variable to allow specifying a comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams. [#631](https://github.com/sourcebot-dev/sourcebot/pull/631)
|
||||
|
||||
### Fixed
|
||||
- Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629)
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ The following environment variables allow you to configure your Sourcebot deploy
|
|||
| `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - | <p>Optional file to log to if structured logging is enabled</p> |
|
||||
| `SOURCEBOT_TELEMETRY_DISABLED` | `false` | <p>Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.</p> |
|
||||
| `DEFAULT_MAX_MATCH_COUNT` | `10000` | <p>The default maximum number of search results to return when using search in the web app.</p> |
|
||||
| `ALWAYS_INDEX_FILE_PATTERNS` | - | <p>A comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams.</p> |
|
||||
|
||||
### Enterprise Environment Variables
|
||||
| Variable | Default | Description |
|
||||
|
|
|
|||
|
|
@ -69,6 +69,26 @@ To learn more about how to create a connection for a specific code host, check o
|
|||
|
||||
<Note>Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md).</Note>
|
||||
|
||||
## Indexing Large Files
|
||||
|
||||
By default, Sourcebot will skip indexing files that are larger than 2MB or have more than 20,000 trigrams. You can configure this by setting the `maxFileSize` and `maxTrigramCount` [settings](/docs/configuration/config-file#settings).
|
||||
|
||||
These limits can be ignored for specific files by passing in a comma separated list of glob patterns matching file paths to the `ALWAYS_INDEX_FILE_PATTERNS` environment variable. For example:
|
||||
|
||||
```bash
|
||||
# Always index all .sum and .lock files
|
||||
ALWAYS_INDEX_FILE_PATTERNS=**/*.sum,**/*.lock
|
||||
```
|
||||
|
||||
Files that have been skipped are assigned the `skipped` language. You can view a list of all skipped files by using the following query:
|
||||
```
|
||||
lang:skipped
|
||||
```
|
||||
|
||||
## Indexing Binary Files
|
||||
|
||||
Binary files cannot be indexed by Sourcebot. See [#575](https://github.com/sourcebot-dev/sourcebot/issues/575) for more information.
|
||||
|
||||
|
||||
## Schema reference
|
||||
---
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { Repo } from "@sourcebot/db";
|
||||
import { createLogger } from "@sourcebot/shared";
|
||||
import { createLogger, env } from "@sourcebot/shared";
|
||||
import { exec } from "child_process";
|
||||
import { INDEX_CACHE_DIR } from "./constants.js";
|
||||
import { Settings } from "./types.js";
|
||||
|
|
@ -11,6 +11,8 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
|
|||
const { path: repoPath } = getRepoPath(repo);
|
||||
const shardPrefix = getShardPrefix(repo.orgId, repo.id);
|
||||
|
||||
const largeFileGlobPatterns = env.ALWAYS_INDEX_FILE_PATTERNS?.split(',').map(pattern => pattern.trim()) ?? [];
|
||||
|
||||
const command = [
|
||||
'zoekt-git-index',
|
||||
'-allow_missing_branches',
|
||||
|
|
@ -21,6 +23,7 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
|
|||
`-tenant_id ${repo.orgId}`,
|
||||
`-repo_id ${repo.id}`,
|
||||
`-shard_prefix ${shardPrefix}`,
|
||||
...largeFileGlobPatterns.map((pattern) => `-large_file ${pattern}`),
|
||||
repoPath
|
||||
].join(' ');
|
||||
|
||||
|
|
|
|||
|
|
@ -219,6 +219,9 @@ export const env = createEnv({
|
|||
|
||||
// Configure the default maximum number of search results to return.
|
||||
DEFAULT_MAX_MATCH_COUNT: numberSchema.default(10_000),
|
||||
|
||||
// A comma separated list of glob patterns matching file paths that should always be indexed, regardless of their size or number of trigrams.
|
||||
ALWAYS_INDEX_FILE_PATTERNS: z.string().optional(),
|
||||
},
|
||||
runtimeEnv,
|
||||
emptyStringAsUndefined: true,
|
||||
|
|
|
|||
Loading…
Reference in a new issue