Mirror of https://github.com/sourcebot-dev/sourcebot.git

Add config option to specify max file size (#118)
parent d4e6410b28 · commit 111023b1dc
10 changed files with 202 additions and 29 deletions

CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added

- Added config option `settings.maxFileSize` to control the maximum file size zoekt will index. ([#118](https://github.com/sourcebot-dev/sourcebot/pull/118))

## [2.6.0] - 2024-12-02

### Added
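
For context, a minimal sketch of how the new option could look in a Sourcebot configuration file. The `settings.maxFileSize` field and its 2MB (2097152 bytes) default come from the schema changes further down; the inline comments only work because the backend strips JSON comments before parsing (see `stripJsonComments` in the main.ts changes), and the `repos` placeholder is purely illustrative.

{
  "settings": {
    // Index files up to 5 MB instead of the default 2 MB.
    "maxFileSize": 5242880
  },
  "repos": [
    // ... GitHub / GitLab / Gitea / Gerrit / local repo configs ...
  ]
}

If `settings` or `maxFileSize` is omitted, the backend falls back to DEFAULT_SETTINGS.maxFileSize, as the constants.ts and main.ts changes below show.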

constants.ts
@@ -1,3 +1,4 @@
import { Settings } from "./types.js";

/**
 * The interval to reindex a given repository.
@@ -8,3 +9,10 @@ export const REINDEX_INTERVAL_MS = 1000 * 60 * 60;
 * The interval to re-sync the config.
 */
export const RESYNC_CONFIG_INTERVAL_MS = 1000 * 60 * 60 * 24;

/**
 * Default settings.
 */
export const DEFAULT_SETTINGS: Settings = {
    maxFileSize: 2 * 1024 * 1024, // 2MB in bytes
}

packages/backend/src/db.test.ts (new file, 32 lines)
@@ -0,0 +1,32 @@
import { expect, test } from 'vitest';
import { migration_addMaxFileSize, migration_addSettings, Schema } from './db';
import { DEFAULT_SETTINGS } from './constants';
import { DeepPartial } from './types';

test('migration_addSettings adds the `settings` field with defaults if it does not exist', () => {
    const schema: DeepPartial<Schema> = {};

    const migratedSchema = migration_addSettings(schema as Schema);
    expect(migratedSchema).toStrictEqual({
        settings: DEFAULT_SETTINGS,
    });
});

test('migration_addMaxFileSize adds the `maxFileSize` field with the default value if it does not exist', () => {
    const schema: DeepPartial<Schema> = {
        settings: {},
    }

    const migratedSchema = migration_addMaxFileSize(schema as Schema);
    expect(migratedSchema).toStrictEqual({
        settings: {
            maxFileSize: DEFAULT_SETTINGS.maxFileSize,
        }
    });
});

test('migration_addMaxFileSize will throw if `settings` is not defined', () => {
    const schema: DeepPartial<Schema> = {};
    expect(() => migration_addMaxFileSize(schema as Schema)).toThrow();
});

db.ts
@@ -1,8 +1,13 @@
import { JSONFilePreset } from "lowdb/node";
import { type Low } from "lowdb";
import { AppContext, Repository } from "./types.js";
import { AppContext, Repository, Settings } from "./types.js";
import { DEFAULT_SETTINGS } from "./constants.js";
import { createLogger } from "./logger.js";

type Schema = {
const logger = createLogger('db');

export type Schema = {
    settings: Settings,
    repos: {
        [key: string]: Repository;
    }
@@ -11,9 +16,16 @@ type Schema = {
export type Database = Low<Schema>;

export const loadDB = async (ctx: AppContext): Promise<Database> => {
    const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, { repos: {} });
    const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, {
        repos: {},
        settings: DEFAULT_SETTINGS,
    });

    await applyMigrations(db);

    return db;
}

export const updateRepository = async (repoId: string, data: Repository, db: Database) => {
    db.data.repos[repoId] = {
        ...db.data.repos[repoId],
@@ -22,7 +34,49 @@ export const updateRepository = async (repoId: string, data: Repository, db: Dat
    await db.write();
}

export const updateSettings = async (settings: Settings, db: Database) => {
    db.data.settings = settings;
    await db.write();
}

export const createRepository = async (repo: Repository, db: Database) => {
    db.data.repos[repo.id] = repo;
    await db.write();
}

export const applyMigrations = async (db: Database) => {
    const log = (name: string) => {
        logger.info(`Applying migration '${name}'`);
    }

    await db.update((schema) => {
        // @NOTE: please ensure new migrations are added after older ones!
        schema = migration_addSettings(schema, log);
        schema = migration_addMaxFileSize(schema, log);
        return schema;
    });
}

/**
 * @see: https://github.com/sourcebot-dev/sourcebot/pull/118
 */
export const migration_addSettings = (schema: Schema, log?: (name: string) => void) => {
    if (!schema.settings) {
        log?.("addSettings");
        schema.settings = DEFAULT_SETTINGS;
    }

    return schema;
}

/**
 * @see: https://github.com/sourcebot-dev/sourcebot/pull/118
 */
export const migration_addMaxFileSize = (schema: Schema, log?: (name: string) => void) => {
    if (!schema.settings.maxFileSize) {
        log?.("addMaxFileSize");
        schema.settings.maxFileSize = DEFAULT_SETTINGS.maxFileSize;
    }

    return schema;
}

main.test.ts
@@ -1,6 +1,6 @@
import { expect, test } from 'vitest';
import { isRepoReindxingRequired } from './main';
import { Repository } from './types';
import { isAllRepoReindexingRequired, isRepoReindexingRequired } from './main';
import { Repository, Settings } from './types';

test('isRepoReindexingRequired should return false when no changes are made', () => {
    const previous: Repository = {
@@ -15,7 +15,7 @@ test('isRepoReindexingRequired should return false when no changes are made', ()
    };
    const current = previous;

    expect(isRepoReindxingRequired(previous, current)).toBe(false);
    expect(isRepoReindexingRequired(previous, current)).toBe(false);
})

test('isRepoReindexingRequired should return true when git branches change', () => {
@@ -35,7 +35,7 @@ test('isRepoReindexingRequired should return true when git branches change', ()
        branches: ['main', 'feature']
    };

    expect(isRepoReindxingRequired(previous, current)).toBe(true);
    expect(isRepoReindexingRequired(previous, current)).toBe(true);
});

test('isRepoReindexingRequired should return true when git tags change', () => {
@@ -55,7 +55,7 @@ test('isRepoReindexingRequired should return true when git tags change', () => {
        tags: ['v1.0', 'v2.0']
    };

    expect(isRepoReindxingRequired(previous, current)).toBe(true);
    expect(isRepoReindexingRequired(previous, current)).toBe(true);
});

test('isRepoReindexingRequired should return true when local excludedPaths change', () => {
@@ -74,5 +74,26 @@ test('isRepoReindexingRequired should return true when local excludedPaths chang
        excludedPaths: ['node_modules', 'dist']
    };

    expect(isRepoReindxingRequired(previous, current)).toBe(true);
    expect(isRepoReindexingRequired(previous, current)).toBe(true);
});

test('isAllRepoReindexingRequired should return false when fileLimitSize has not changed', () => {
    const previous: Settings = {
        maxFileSize: 1000,
    }
    const current: Settings = {
        ...previous,
    }
    expect(isAllRepoReindexingRequired(previous, current)).toBe(false);
});

test('isAllRepoReindexingRequired should return true when fileLimitSize has changed', () => {
    const previous: Settings = {
        maxFileSize: 1000,
    }
    const current: Settings = {
        ...previous,
        maxFileSize: 2000,
    }
    expect(isAllRepoReindexingRequired(previous, current)).toBe(true);
});

main.ts
@@ -5,12 +5,12 @@ import { getGitHubReposFromConfig } from "./github.js";
import { getGitLabReposFromConfig } from "./gitlab.js";
import { getGiteaReposFromConfig } from "./gitea.js";
import { getGerritReposFromConfig } from "./gerrit.js";
import { AppContext, LocalRepository, GitRepository, Repository } from "./types.js";
import { AppContext, LocalRepository, GitRepository, Repository, Settings } from "./types.js";
import { cloneRepository, fetchRepository } from "./git.js";
import { createLogger } from "./logger.js";
import { createRepository, Database, loadDB, updateRepository } from './db.js';
import { createRepository, Database, loadDB, updateRepository, updateSettings } from './db.js';
import { arraysEqualShallow, isRemotePath, measure } from "./utils.js";
import { REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js";
import { DEFAULT_SETTINGS, REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js";
import stripJsonComments from 'strip-json-comments';
import { indexGitRepository, indexLocalRepository } from "./zoekt.js";
import { getLocalRepoFromConfig, initLocalRepoFileWatchers } from "./local.js";
@@ -18,7 +18,7 @@ import { captureEvent } from "./posthog.js";

const logger = createLogger('main');

const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
    let fetchDuration_s: number | undefined = undefined;
    let cloneDuration_s: number | undefined = undefined;

@@ -46,7 +46,7 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
    }

    logger.info(`Indexing ${repo.id}...`);
    const { durationMs } = await measure(() => indexGitRepository(repo, ctx));
    const { durationMs } = await measure(() => indexGitRepository(repo, settings, ctx));
    const indexDuration_s = durationMs / 1000;
    logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);

@@ -57,9 +57,9 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
    }
}

const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => {
const syncLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => {
    logger.info(`Indexing ${repo.id}...`);
    const { durationMs } = await measure(() => indexLocalRepository(repo, ctx, signal));
    const { durationMs } = await measure(() => indexLocalRepository(repo, settings, ctx, signal));
    const indexDuration_s = durationMs / 1000;
    logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
    return {
@@ -67,8 +67,11 @@ const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signa
    }
}

export const isRepoReindxingRequired = (previous: Repository, current: Repository) => {
/**
 * Certain configuration changes (e.g., a branch is added) require
 * a reindexing of the repository.
 */
export const isRepoReindexingRequired = (previous: Repository, current: Repository) => {
    /**
     * Checks if the any of the `revisions` properties have changed.
     */
@@ -100,6 +103,16 @@ export const isRepoReindxingRequired = (previous: Repository, current: Repositor
    )
}

/**
 * Certain settings changes (e.g., the file limit size is changed) require
 * a reindexing of _all_ repositories.
 */
export const isAllRepoReindexingRequired = (previous: Settings, current: Settings) => {
    return (
        previous?.maxFileSize !== current?.maxFileSize
    )
}

const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, ctx: AppContext) => {
    const configContent = await (async () => {
        if (isRemotePath(configPath)) {
@@ -121,6 +134,13 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal,
    // @todo: we should validate the configuration file's structure here.
    const config = JSON.parse(stripJsonComments(configContent)) as SourcebotConfigurationSchema;

    // Update the settings
    const updatedSettings: Settings = {
        maxFileSize: config.settings?.maxFileSize ?? DEFAULT_SETTINGS.maxFileSize,
    }
    const _isAllRepoReindexingRequired = isAllRepoReindexingRequired(db.data.settings, updatedSettings);
    await updateSettings(updatedSettings, db);

    // Fetch all repositories from the config file
    let configRepos: Repository[] = [];
    for (const repoConfig of config.repos ?? []) {
@@ -172,7 +192,7 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal,
    for (const newRepo of configRepos) {
        if (newRepo.id in db.data.repos) {
            const existingRepo = db.data.repos[newRepo.id];
            const isReindexingRequired = isRepoReindxingRequired(existingRepo, newRepo);
            const isReindexingRequired = _isAllRepoReindexingRequired || isRepoReindexingRequired(existingRepo, newRepo);
            if (isReindexingRequired) {
                logger.info(`Marking ${newRepo.id} for reindexing due to configuration change.`);
            }
@@ -244,7 +264,7 @@ export const main = async (context: AppContext) => {
        const localRepos = Object.values(db.data.repos).filter(repo => repo.vcs === 'local');
        initLocalRepoFileWatchers(localRepos, async (repo, signal) => {
            logger.info(`Change detected to local repository ${repo.id}. Re-syncing...`);
            await syncLocalRepository(repo, context, signal);
            await syncLocalRepository(repo, db.data.settings, context, signal);
            await db.update(({ repos }) => repos[repo.id].lastIndexedDate = new Date().toUTCString());
        });
    }
@@ -285,12 +305,12 @@ export const main = async (context: AppContext) => {
            let cloneDuration_s: number | undefined;

            if (repo.vcs === 'git') {
                const stats = await syncGitRepository(repo, context);
                const stats = await syncGitRepository(repo, db.data.settings, context);
                indexDuration_s = stats.indexDuration_s;
                fetchDuration_s = stats.fetchDuration_s;
                cloneDuration_s = stats.cloneDuration_s;
            } else if (repo.vcs === 'local') {
                const stats = await syncLocalRepository(repo, context);
                const stats = await syncLocalRepository(repo, db.data.settings, context);
                indexDuration_s = stats.indexDuration_s;
            }

Configuration schema types (SourcebotConfigurationSchema)
@@ -7,11 +7,21 @@ export type Repos = GitHubConfig | GitLabConfig | GiteaConfig | GerritConfig | L
 */
export interface SourcebotConfigurationSchema {
    $schema?: string;
    settings?: Settings;
    /**
     * Defines a collection of repositories from varying code hosts that Sourcebot should sync with.
     */
    repos?: Repos[];
}
/**
 * Global settings. These settings are applied to all repositories.
 */
export interface Settings {
    /**
     * The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be indexed. Defaults to 2MB (2097152 bytes).
     */
    maxFileSize?: number;
}
export interface GitHubConfig {
    /**
     * GitHub Configuration

types.ts
@@ -1,4 +1,3 @@
interface BaseRepository {
    vcs: 'git' | 'local';
    id: string;
@@ -42,3 +41,12 @@ export type AppContext = {
    configPath: string;
}

export type Settings = {
    maxFileSize: number;
}

// @see : https://stackoverflow.com/a/61132308
export type DeepPartial<T> = T extends object ? {
    [P in keyof T]?: DeepPartial<T[P]>;
} : T;

zoekt.ts
@@ -1,16 +1,16 @@
import { exec } from "child_process";
import { AppContext, GitRepository, LocalRepository } from "./types.js";
import { AppContext, GitRepository, LocalRepository, Settings } from "./types.js";

const ALWAYS_EXCLUDED_DIRS = ['.git', '.hg', '.svn'];

export const indexGitRepository = async (repo: GitRepository, ctx: AppContext) => {
export const indexGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
    const revisions = [
        'HEAD',
        ...repo.branches ?? [],
        ...repo.tags ?? [],
    ];

    const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -branches ${revisions.join(',')} ${repo.path}`;
    const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -branches ${revisions.join(',')} ${repo.path}`;

    return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => {
        exec(command, (error, stdout, stderr) => {
@@ -26,9 +26,9 @@ export const indexGitRepository = async (repo: GitRepository, ctx: AppContext) =
    });
}

export const indexLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => {
export const indexLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => {
    const excludedDirs = [...ALWAYS_EXCLUDED_DIRS, repo.excludedPaths];
    const command = `zoekt-index -index ${ctx.indexPath} -ignore_dirs ${excludedDirs.join(',')} ${repo.path}`;
    const command = `zoekt-index -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -ignore_dirs ${excludedDirs.join(',')} ${repo.path}`;

    return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => {
        exec(command, { signal }, (error, stdout, stderr) => {

Configuration JSON schema
@@ -477,12 +477,28 @@
                    "$ref": "#/definitions/LocalConfig"
                }
            ]
        },
        "Settings": {
            "type": "object",
            "description": "Global settings. These settings are applied to all repositories.",
            "properties": {
                "maxFileSize": {
                    "type": "integer",
                    "description": "The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be indexed. Defaults to 2MB (2097152 bytes).",
                    "default": 2097152,
                    "minimum": 1
                }
            },
            "additionalProperties": false
        }
    },
    "properties": {
        "$schema": {
            "type": "string"
        },
        "settings": {
            "$ref": "#/definitions/Settings"
        },
        "repos": {
            "type": "array",
            "description": "Defines a collection of repositories from varying code hosts that Sourcebot should sync with.",
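
As a quick note on what this Settings definition enforces (the values below are illustrative only): a config containing { "settings": { "maxFileSize": 1048576 } } validates, while "maxFileSize": 0 is rejected by the "minimum": 1 constraint, and any unrecognized key under "settings" is rejected by "additionalProperties": false.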