Add config option to specify max file size (#118)

Brendan Kellam 2024-12-09 12:34:43 -10:00 committed by GitHub
parent d4e6410b28
commit 111023b1dc
10 changed files with 202 additions and 29 deletions

View file

@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Added config option `settings.maxFileSize` to control the maximum file size zoekt will index. ([#118](https://github.com/sourcebot-dev/sourcebot/pull/118))
## [2.6.0] - 2024-12-02
### Added
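A minimal sketch of how the new option is set in a Sourcebot configuration file (repo entries omitted for brevity): this example lowers the limit to 1 MiB (1,048,576 bytes), while leaving `settings` out entirely keeps the 2 MB default.

```json
{
  "settings": {
    "maxFileSize": 1048576
  },
  "repos": []
}
```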

View file

@@ -1,3 +1,4 @@
import { Settings } from "./types.js";
/**
* The interval to reindex a given repository.
@@ -7,4 +8,11 @@ export const REINDEX_INTERVAL_MS = 1000 * 60 * 60;
/**
* The interval to re-sync the config.
*/
export const RESYNC_CONFIG_INTERVAL_MS = 1000 * 60 * 60 * 24;
/**
* Default settings.
*/
export const DEFAULT_SETTINGS: Settings = {
maxFileSize: 2 * 1024 * 1024, // 2MB in bytes
}
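Note: `2 * 1024 * 1024` works out to 2,097,152 bytes, matching the `default: 2097152` value declared in the JSON schema further down in this commit.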

View file

@@ -0,0 +1,32 @@
import { expect, test } from 'vitest';
import { migration_addMaxFileSize, migration_addSettings, Schema } from './db';
import { DEFAULT_SETTINGS } from './constants';
import { DeepPartial } from './types';
test('migration_addSettings adds the `settings` field with defaults if it does not exist', () => {
const schema: DeepPartial<Schema> = {};
const migratedSchema = migration_addSettings(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: DEFAULT_SETTINGS,
});
});
test('migration_addMaxFileSize adds the `maxFileSize` field with the default value if it does not exist', () => {
const schema: DeepPartial<Schema> = {
settings: {},
}
const migratedSchema = migration_addMaxFileSize(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
}
});
});
test('migration_addMaxFileSize will throw if `settings` is not defined', () => {
const schema: DeepPartial<Schema> = {};
expect(() => migration_addMaxFileSize(schema as Schema)).toThrow();
});

View file

@@ -1,8 +1,13 @@
import { JSONFilePreset } from "lowdb/node";
import { type Low } from "lowdb";
import { AppContext, Repository } from "./types.js";
import { AppContext, Repository, Settings } from "./types.js";
import { DEFAULT_SETTINGS } from "./constants.js";
import { createLogger } from "./logger.js";
type Schema = {
const logger = createLogger('db');
export type Schema = {
settings: Settings,
repos: {
[key: string]: Repository;
}
@@ -11,9 +16,16 @@ type Schema = {
export type Database = Low<Schema>;
export const loadDB = async (ctx: AppContext): Promise<Database> => {
const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, { repos: {} });
const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, {
repos: {},
settings: DEFAULT_SETTINGS,
});
await applyMigrations(db);
return db;
}
export const updateRepository = async (repoId: string, data: Repository, db: Database) => {
db.data.repos[repoId] = {
...db.data.repos[repoId],
@@ -22,7 +34,49 @@ export const updateRepository = async (repoId: string, data: Repository, db: Dat
await db.write();
}
export const updateSettings = async (settings: Settings, db: Database) => {
db.data.settings = settings;
await db.write();
}
export const createRepository = async (repo: Repository, db: Database) => {
db.data.repos[repo.id] = repo;
await db.write();
}
export const applyMigrations = async (db: Database) => {
const log = (name: string) => {
logger.info(`Applying migration '${name}'`);
}
await db.update((schema) => {
// @NOTE: please ensure new migrations are added after older ones!
schema = migration_addSettings(schema, log);
schema = migration_addMaxFileSize(schema, log);
return schema;
});
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/118
*/
export const migration_addSettings = (schema: Schema, log?: (name: string) => void) => {
if (!schema.settings) {
log?.("addSettings");
schema.settings = DEFAULT_SETTINGS;
}
return schema;
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/118
*/
export const migration_addMaxFileSize = (schema: Schema, log?: (name: string) => void) => {
if (!schema.settings.maxFileSize) {
log?.("addMaxFileSize");
schema.settings.maxFileSize = DEFAULT_SETTINGS.maxFileSize;
}
return schema;
}
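With these migrations in place, a freshly created cache database (`db.json` under `ctx.cachePath`) ends up with roughly the following shape — a sketch; entries are added under `repos` as repositories are synced:

```json
{
  "settings": {
    "maxFileSize": 2097152
  },
  "repos": {}
}
```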

View file

@@ -1,6 +1,6 @@
import { expect, test } from 'vitest';
import { isRepoReindxingRequired } from './main';
import { Repository } from './types';
import { isAllRepoReindexingRequired, isRepoReindexingRequired } from './main';
import { Repository, Settings } from './types';
test('isRepoReindexingRequired should return false when no changes are made', () => {
const previous: Repository = {
@@ -15,7 +15,7 @@ test('isRepoReindexingRequired should return false when no changes are made', ()
};
const current = previous;
expect(isRepoReindxingRequired(previous, current)).toBe(false);
expect(isRepoReindexingRequired(previous, current)).toBe(false);
})
test('isRepoReindexingRequired should return true when git branches change', () => {
@@ -35,7 +35,7 @@ test('isRepoReindexingRequired should return true when git branches change', ()
branches: ['main', 'feature']
};
expect(isRepoReindxingRequired(previous, current)).toBe(true);
expect(isRepoReindexingRequired(previous, current)).toBe(true);
});
test('isRepoReindexingRequired should return true when git tags change', () => {
@@ -55,7 +55,7 @@ test('isRepoReindexingRequired should return true when git tags change', () => {
tags: ['v1.0', 'v2.0']
};
expect(isRepoReindxingRequired(previous, current)).toBe(true);
expect(isRepoReindexingRequired(previous, current)).toBe(true);
});
test('isRepoReindexingRequired should return true when local excludedPaths change', () => {
@@ -74,5 +74,26 @@ test('isRepoReindexingRequired should return true when local excludedPaths chang
excludedPaths: ['node_modules', 'dist']
};
expect(isRepoReindxingRequired(previous, current)).toBe(true);
expect(isRepoReindexingRequired(previous, current)).toBe(true);
});
test('isAllRepoReindexingRequired should return false when maxFileSize has not changed', () => {
const previous: Settings = {
maxFileSize: 1000,
}
const current: Settings = {
...previous,
}
expect(isAllRepoReindexingRequired(previous, current)).toBe(false);
});
test('isAllRepoReindexingRequired should return true when maxFileSize has changed', () => {
const previous: Settings = {
maxFileSize: 1000,
}
const current: Settings = {
...previous,
maxFileSize: 2000,
}
expect(isAllRepoReindexingRequired(previous, current)).toBe(true);
});

View file

@@ -5,12 +5,12 @@ import { getGitHubReposFromConfig } from "./github.js";
import { getGitLabReposFromConfig } from "./gitlab.js";
import { getGiteaReposFromConfig } from "./gitea.js";
import { getGerritReposFromConfig } from "./gerrit.js";
import { AppContext, LocalRepository, GitRepository, Repository } from "./types.js";
import { AppContext, LocalRepository, GitRepository, Repository, Settings } from "./types.js";
import { cloneRepository, fetchRepository } from "./git.js";
import { createLogger } from "./logger.js";
import { createRepository, Database, loadDB, updateRepository } from './db.js';
import { createRepository, Database, loadDB, updateRepository, updateSettings } from './db.js';
import { arraysEqualShallow, isRemotePath, measure } from "./utils.js";
import { REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js";
import { DEFAULT_SETTINGS, REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js";
import stripJsonComments from 'strip-json-comments';
import { indexGitRepository, indexLocalRepository } from "./zoekt.js";
import { getLocalRepoFromConfig, initLocalRepoFileWatchers } from "./local.js";
@@ -18,7 +18,7 @@ import { captureEvent } from "./posthog.js";
const logger = createLogger('main');
const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
let fetchDuration_s: number | undefined = undefined;
let cloneDuration_s: number | undefined = undefined;
@@ -46,7 +46,7 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
}
logger.info(`Indexing ${repo.id}...`);
const { durationMs } = await measure(() => indexGitRepository(repo, ctx));
const { durationMs } = await measure(() => indexGitRepository(repo, settings, ctx));
const indexDuration_s = durationMs / 1000;
logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
@@ -57,9 +57,9 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
}
}
const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => {
const syncLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => {
logger.info(`Indexing ${repo.id}...`);
const { durationMs } = await measure(() => indexLocalRepository(repo, ctx, signal));
const { durationMs } = await measure(() => indexLocalRepository(repo, settings, ctx, signal));
const indexDuration_s = durationMs / 1000;
logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
return {
@@ -67,8 +67,11 @@ const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signa
}
}
export const isRepoReindxingRequired = (previous: Repository, current: Repository) => {
/**
* Certain configuration changes (e.g., a branch is added) require
* a reindexing of the repository.
*/
export const isRepoReindexingRequired = (previous: Repository, current: Repository) => {
/**
* Checks if any of the `revisions` properties have changed.
*/
@@ -100,6 +103,16 @@ export const isRepoReindxingRequired = (previous: Repository, current: Repositor
)
}
/**
* Certain settings changes (e.g., the max file size is changed) require
* a reindexing of _all_ repositories.
*/
export const isAllRepoReindexingRequired = (previous: Settings, current: Settings) => {
return (
previous?.maxFileSize !== current?.maxFileSize
)
}
const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, ctx: AppContext) => {
const configContent = await (async () => {
if (isRemotePath(configPath)) {
@@ -121,6 +134,13 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal,
// @todo: we should validate the configuration file's structure here.
const config = JSON.parse(stripJsonComments(configContent)) as SourcebotConfigurationSchema;
// Update the settings
const updatedSettings: Settings = {
maxFileSize: config.settings?.maxFileSize ?? DEFAULT_SETTINGS.maxFileSize,
}
const _isAllRepoReindexingRequired = isAllRepoReindexingRequired(db.data.settings, updatedSettings);
await updateSettings(updatedSettings, db);
// Fetch all repositories from the config file
let configRepos: Repository[] = [];
for (const repoConfig of config.repos ?? []) {
@@ -172,7 +192,7 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal,
for (const newRepo of configRepos) {
if (newRepo.id in db.data.repos) {
const existingRepo = db.data.repos[newRepo.id];
const isReindexingRequired = isRepoReindxingRequired(existingRepo, newRepo);
const isReindexingRequired = _isAllRepoReindexingRequired || isRepoReindexingRequired(existingRepo, newRepo);
if (isReindexingRequired) {
logger.info(`Marking ${newRepo.id} for reindexing due to configuration change.`);
}
@@ -244,7 +264,7 @@ export const main = async (context: AppContext) => {
const localRepos = Object.values(db.data.repos).filter(repo => repo.vcs === 'local');
initLocalRepoFileWatchers(localRepos, async (repo, signal) => {
logger.info(`Change detected to local repository ${repo.id}. Re-syncing...`);
await syncLocalRepository(repo, context, signal);
await syncLocalRepository(repo, db.data.settings, context, signal);
await db.update(({ repos }) => repos[repo.id].lastIndexedDate = new Date().toUTCString());
});
}
@@ -285,12 +305,12 @@ export const main = async (context: AppContext) => {
let cloneDuration_s: number | undefined;
if (repo.vcs === 'git') {
const stats = await syncGitRepository(repo, context);
const stats = await syncGitRepository(repo, db.data.settings, context);
indexDuration_s = stats.indexDuration_s;
fetchDuration_s = stats.fetchDuration_s;
cloneDuration_s = stats.cloneDuration_s;
} else if (repo.vcs === 'local') {
const stats = await syncLocalRepository(repo, context);
const stats = await syncLocalRepository(repo, db.data.settings, context);
indexDuration_s = stats.indexDuration_s;
}
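Taken together: on each config re-sync, `config.settings` is resolved against `DEFAULT_SETTINGS`, compared with the previously stored settings via `isAllRepoReindexingRequired`, and persisted with `updateSettings`; the stored `db.data.settings` is then threaded through `syncGitRepository` / `syncLocalRepository` into the zoekt indexers, so a change to `maxFileSize` marks every repository for reindexing rather than only those whose own configuration changed.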

View file

@@ -7,11 +7,21 @@ export type Repos = GitHubConfig | GitLabConfig | GiteaConfig | GerritConfig | L
*/
export interface SourcebotConfigurationSchema {
$schema?: string;
settings?: Settings;
/**
* Defines a collection of repositories from varying code hosts that Sourcebot should sync with.
*/
repos?: Repos[];
}
/**
* Global settings. These settings are applied to all repositories.
*/
export interface Settings {
/**
* The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be indexed. Defaults to 2MB (2097152 bytes).
*/
maxFileSize?: number;
}
export interface GitHubConfig {
/**
* GitHub Configuration

View file

@@ -1,4 +1,3 @@
interface BaseRepository {
vcs: 'git' | 'local';
id: string;
@@ -42,3 +41,12 @@ export type AppContext = {
configPath: string;
}
export type Settings = {
maxFileSize: number;
}
// @see : https://stackoverflow.com/a/61132308
export type DeepPartial<T> = T extends object ? {
[P in keyof T]?: DeepPartial<T[P]>;
} : T;
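`DeepPartial` is what lets the migration tests above build partially populated `Schema` values (e.g. `{ settings: {} }`) without supplying every required field before casting to `Schema`.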

View file

@@ -1,16 +1,16 @@
import { exec } from "child_process";
import { AppContext, GitRepository, LocalRepository } from "./types.js";
import { AppContext, GitRepository, LocalRepository, Settings } from "./types.js";
const ALWAYS_EXCLUDED_DIRS = ['.git', '.hg', '.svn'];
export const indexGitRepository = async (repo: GitRepository, ctx: AppContext) => {
export const indexGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
const revisions = [
'HEAD',
...repo.branches ?? [],
...repo.tags ?? [],
];
const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -branches ${revisions.join(',')} ${repo.path}`;
const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -branches ${revisions.join(',')} ${repo.path}`;
return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => {
exec(command, (error, stdout, stderr) => {
@@ -26,9 +26,9 @@ export const indexGitRepository = async (repo: GitRepository, ctx: AppContext) =
});
}
export const indexLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => {
export const indexLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => {
const excludedDirs = [...ALWAYS_EXCLUDED_DIRS, repo.excludedPaths];
const command = `zoekt-index -index ${ctx.indexPath} -ignore_dirs ${excludedDirs.join(',')} ${repo.path}`;
const command = `zoekt-index -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -ignore_dirs ${excludedDirs.join(',')} ${repo.path}`;
return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => {
exec(command, { signal }, (error, stdout, stderr) => {
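With the default settings, the git indexing command therefore expands to roughly `zoekt-git-index -allow_missing_branches -index <ctx.indexPath> -file_limit 2097152 -branches <revisions> <repo.path>` (paths and revisions shown as placeholders), so files larger than 2,097,152 bytes are skipped by zoekt at index time.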

View file

@@ -477,12 +477,28 @@
"$ref": "#/definitions/LocalConfig"
}
]
},
"Settings": {
"type": "object",
"description": "Global settings. These settings are applied to all repositories.",
"properties": {
"maxFileSize": {
"type": "integer",
"description": "The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be inexed. Defaults to 2MB (2097152 bytes).",
"default": 2097152,
"minimum": 1
}
},
"additionalProperties": false
}
},
"properties": {
"$schema": {
"type": "string"
},
"settings": {
"$ref": "#/definitions/Settings"
},
"repos": {
"type": "array",
"description": "Defines a collection of repositories from varying code hosts that Sourcebot should sync with.",