SQL Database (#157)

Brendan Kellam 2025-01-14 13:37:31 -08:00 committed by GitHub
parent 75999800e7
commit 15b9e777e4
30 changed files with 851 additions and 1086 deletions


@@ -31,6 +31,15 @@ ENV NEXT_PUBLIC_POSTHOG_PAPIK=BAKED_NEXT_PUBLIC_POSTHOG_PAPIK
ARG NEXT_PUBLIC_DOMAIN_SUB_PATH=/BAKED_NEXT_PUBLIC_DOMAIN_SUB_PATH
RUN yarn workspace @sourcebot/web build
# ------ Build Database ------
FROM node-alpine AS database-builder
WORKDIR /app
COPY package.json yarn.lock* ./
COPY ./packages/db ./packages/db
RUN yarn workspace @sourcebot/db install --frozen-lockfile
# ------ Build Backend ------
FROM node-alpine AS backend-builder
WORKDIR /app
@@ -38,6 +47,8 @@ WORKDIR /app
COPY package.json yarn.lock* ./
COPY ./schemas ./schemas
COPY ./packages/backend ./packages/backend
COPY --from=database-builder /app/node_modules ./node_modules
COPY --from=database-builder /app/packages/db ./packages/db
RUN yarn workspace @sourcebot/backend install --frozen-lockfile
RUN yarn workspace @sourcebot/backend build
@@ -100,6 +111,10 @@ COPY --from=web-builder /app/packages/web/.next/static ./packages/web/.next/stat
COPY --from=backend-builder /app/node_modules ./node_modules
COPY --from=backend-builder /app/packages/backend ./packages/backend
# Configure the database
COPY --from=database-builder /app/node_modules ./node_modules
COPY --from=database-builder /app/packages/db ./packages/db
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY prefix-output.sh ./prefix-output.sh
RUN chmod +x ./prefix-output.sh


@@ -1,10 +1,11 @@
CMDS := zoekt ui
CMDS := zoekt yarn
ALL: $(CMDS)
ui:
yarn:
yarn install
yarn workspace @sourcebot/db prisma:migrate:dev
zoekt:
mkdir -p bin
@@ -20,6 +21,8 @@ clean:
packages/web/.next \
packages/backend/dist \
packages/backend/node_modules \
packages/db/node_modules \
packages/db/dist \
.sourcebot
.PHONY: bin


@@ -19,6 +19,11 @@ if [ ! -d "$DATA_CACHE_DIR" ]; then
mkdir -p "$DATA_CACHE_DIR"
fi
# Run database migrations
echo -e "\e[34m[Info] Running database migration...\e[0m"
export DATABASE_URL="file:$DATA_CACHE_DIR/db.sqlite"
yarn workspace @sourcebot/db prisma:migrate:prod
# In order to detect if this is the first run, we create a `.installed` file in
# the cache directory.
FIRST_RUN_FILE="$DATA_CACHE_DIR/.installedv2"


@@ -0,0 +1,134 @@
import { PrismaClient } from '@sourcebot/db';
import { readFile } from 'fs/promises';
import stripJsonComments from 'strip-json-comments';
import { getGitHubReposFromConfig } from "./github.js";
import { getGitLabReposFromConfig, GITLAB_CLOUD_HOSTNAME } from "./gitlab.js";
import { SourcebotConfigurationSchema } from "./schemas/v2.js";
import { AppContext } from "./types.js";
import { getTokenFromConfig, isRemotePath, marshalBool } from "./utils.js";
export const syncConfig = async (configPath: string, db: PrismaClient, signal: AbortSignal, ctx: AppContext) => {
const configContent = await (async () => {
if (isRemotePath(configPath)) {
const response = await fetch(configPath, {
signal,
});
if (!response.ok) {
throw new Error(`Failed to fetch config file ${configPath}: ${response.statusText}`);
}
return response.text();
} else {
return readFile(configPath, {
encoding: 'utf-8',
signal,
});
}
})();
// @todo: we should validate the configuration file's structure here.
const config = JSON.parse(stripJsonComments(configContent)) as SourcebotConfigurationSchema;
for (const repoConfig of config.repos ?? []) {
switch (repoConfig.type) {
case 'github': {
const token = repoConfig.token ? getTokenFromConfig(repoConfig.token, ctx) : undefined;
const gitHubRepos = await getGitHubReposFromConfig(repoConfig, signal, ctx);
const hostUrl = repoConfig.url ?? 'https://github.com';
const hostname = repoConfig.url ? new URL(repoConfig.url).hostname : 'github.com';
await Promise.all(gitHubRepos.map((repo) => {
const repoName = `${hostname}/${repo.full_name}`;
const cloneUrl = new URL(repo.clone_url!);
if (token) {
cloneUrl.username = token;
}
const data = {
external_id: repo.id.toString(),
external_codeHostType: 'github',
external_codeHostUrl: hostUrl,
cloneUrl: cloneUrl.toString(),
name: repoName,
isFork: repo.fork,
isArchived: !!repo.archived,
metadata: {
'zoekt.web-url-type': 'github',
'zoekt.web-url': repo.html_url,
'zoekt.name': repoName,
'zoekt.github-stars': (repo.stargazers_count ?? 0).toString(),
'zoekt.github-watchers': (repo.watchers_count ?? 0).toString(),
'zoekt.github-subscribers': (repo.subscribers_count ?? 0).toString(),
'zoekt.github-forks': (repo.forks_count ?? 0).toString(),
'zoekt.archived': marshalBool(repo.archived),
'zoekt.fork': marshalBool(repo.fork),
'zoekt.public': marshalBool(repo.private === false)
},
};
return db.repo.upsert({
where: {
external_id_external_codeHostUrl: {
external_id: repo.id.toString(),
external_codeHostUrl: hostUrl,
},
},
create: data,
update: data,
})
}));
break;
}
case 'gitlab': {
const hostUrl = repoConfig.url ?? 'https://gitlab.com';
const hostname = repoConfig.url ? new URL(repoConfig.url).hostname : GITLAB_CLOUD_HOSTNAME;
const token = repoConfig.token ? getTokenFromConfig(repoConfig.token, ctx) : undefined;
const gitLabRepos = await getGitLabReposFromConfig(repoConfig, ctx);
await Promise.all(gitLabRepos.map((project) => {
const repoName = `${hostname}/${project.path_with_namespace}`;
const isFork = project.forked_from_project !== undefined;
const cloneUrl = new URL(project.http_url_to_repo);
if (token) {
cloneUrl.username = 'oauth2';
cloneUrl.password = token;
}
const data = {
external_id: project.id.toString(),
external_codeHostType: 'gitlab',
external_codeHostUrl: hostUrl,
cloneUrl: cloneUrl.toString(),
name: repoName,
isFork,
isArchived: project.archived,
metadata: {
'zoekt.web-url-type': 'gitlab',
'zoekt.web-url': project.web_url,
'zoekt.name': repoName,
'zoekt.gitlab-stars': project.star_count?.toString() ?? '0',
'zoekt.gitlab-forks': project.forks_count?.toString() ?? '0',
'zoekt.archived': marshalBool(project.archived),
'zoekt.fork': marshalBool(isFork),
'zoekt.public': marshalBool(project.visibility === 'public'),
}
}
return db.repo.upsert({
where: {
external_id_external_codeHostUrl: {
external_id: project.id.toString(),
external_codeHostUrl: hostUrl,
},
},
create: data,
update: data,
})
}));
break;
}
}
}
}
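
To make the new flow concrete, here is a minimal sketch (not part of the commit) of driving `syncConfig` against the Prisma-backed database; the context paths are placeholder values, and the real wiring appears in the backend entry-point changes further down this diff:

import { PrismaClient } from '@sourcebot/db';
import { syncConfig } from './config.js';
import { AppContext } from './types.js';

// Illustrative values only; not part of this commit.
const ctx: AppContext = {
    configPath: '/data/config.json',
    cachePath: '/data/.sourcebot',
    indexPath: '/data/.sourcebot/index',
    reposPath: '/data/.sourcebot/repos',
};
const prisma = new PrismaClient();
const controller = new AbortController();

// Each configured repo is upserted into the `Repo` table keyed on
// (external_id, external_codeHostUrl), so repeated syncs are idempotent.
await syncConfig(ctx.configPath, prisma, controller.signal, ctx);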


@@ -6,6 +6,6 @@ import { Settings } from "./types.js";
export const DEFAULT_SETTINGS: Settings = {
maxFileSize: 2 * 1024 * 1024, // 2MB in bytes
autoDeleteStaleRepos: true,
reindexInterval: 1000 * 60 * 60, // 1 hour in milliseconds
resyncInterval: 1000 * 60 * 60 * 24, // 1 day in milliseconds
reindexIntervalMs: 1000 * 60 * 60, // 1 hour in milliseconds
resyncIntervalMs: 1000 * 60 * 60 * 24, // 1 day in milliseconds
}
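
The `Ms` suffix makes the unit explicit at each call site. A rough sketch, assuming a `resyncConfig` callback that stands in for the real sync loop further down this diff, of how these defaults are consumed:

import { DEFAULT_SETTINGS } from './constants.js';

declare const resyncConfig: () => void; // stand-in for the backend's sync callback

// Re-sync the config file once a day by default.
setInterval(() => {
    resyncConfig();
}, DEFAULT_SETTINGS.resyncIntervalMs);

// A repo is skipped if it was indexed within the last hour (by default).
const isFreshlyIndexed = (indexedAt: Date | null) =>
    (indexedAt ?? new Date(0)).getTime() > Date.now() - DEFAULT_SETTINGS.reindexIntervalMs;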


@@ -1,125 +0,0 @@
import { expect, test } from 'vitest';
import { DEFAULT_DB_DATA, migration_addDeleteStaleRepos, migration_addMaxFileSize, migration_addReindexInterval, migration_addResyncInterval, migration_addSettings, Schema } from './db';
import { DEFAULT_SETTINGS } from './constants';
import { DeepPartial } from './types';
import { Low } from 'lowdb';
class InMemoryAdapter<T> {
private data: T;
async read() {
return this.data;
}
async write(data: T) {
this.data = data;
}
}
export const createMockDB = (defaultData: Schema = DEFAULT_DB_DATA) => {
const db = new Low(new InMemoryAdapter<Schema>(), defaultData);
return db;
}
test('migration_addSettings adds the `settings` field with defaults if it does not exist', () => {
const schema: DeepPartial<Schema> = {};
const migratedSchema = migration_addSettings(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: DEFAULT_SETTINGS,
});
});
test('migration_addMaxFileSize adds the `maxFileSize` field with the default value if it does not exist', () => {
const schema: DeepPartial<Schema> = {
settings: {},
}
const migratedSchema = migration_addMaxFileSize(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
}
});
});
test('migration_addMaxFileSize will throw if `settings` is not defined', () => {
const schema: DeepPartial<Schema> = {};
expect(() => migration_addMaxFileSize(schema as Schema)).toThrow();
});
test('migration_addDeleteStaleRepos adds the `autoDeleteStaleRepos` field with the default value if it does not exist', () => {
const schema: DeepPartial<Schema> = {
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
},
}
const migratedSchema = migration_addDeleteStaleRepos(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
autoDeleteStaleRepos: DEFAULT_SETTINGS.autoDeleteStaleRepos,
}
});
});
test('migration_addReindexInterval adds the `reindexInterval` field with the default value if it does not exist', () => {
const schema: DeepPartial<Schema> = {
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
autoDeleteStaleRepos: DEFAULT_SETTINGS.autoDeleteStaleRepos,
},
}
const migratedSchema = migration_addReindexInterval(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
autoDeleteStaleRepos: DEFAULT_SETTINGS.autoDeleteStaleRepos,
reindexInterval: DEFAULT_SETTINGS.reindexInterval,
}
});
});
test('migration_addReindexInterval preserves existing reindexInterval value if already set', () => {
const customInterval = 60;
const schema: DeepPartial<Schema> = {
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
reindexInterval: customInterval,
},
}
const migratedSchema = migration_addReindexInterval(schema as Schema);
expect(migratedSchema.settings.reindexInterval).toBe(customInterval);
});
test('migration_addResyncInterval adds the `resyncInterval` field with the default value if it does not exist', () => {
const schema: DeepPartial<Schema> = {
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
autoDeleteStaleRepos: DEFAULT_SETTINGS.autoDeleteStaleRepos,
},
}
const migratedSchema = migration_addResyncInterval(schema as Schema);
expect(migratedSchema).toStrictEqual({
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
autoDeleteStaleRepos: DEFAULT_SETTINGS.autoDeleteStaleRepos,
resyncInterval: DEFAULT_SETTINGS.resyncInterval,
}
});
});
test('migration_addResyncInterval preserves existing resyncInterval value if already set', () => {
const customInterval = 120;
const schema: DeepPartial<Schema> = {
settings: {
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
resyncInterval: customInterval,
},
}
const migratedSchema = migration_addResyncInterval(schema as Schema);
expect(migratedSchema.settings.resyncInterval).toBe(customInterval);
});


@@ -1,123 +0,0 @@
import { JSONFilePreset } from "lowdb/node";
import { type Low } from "lowdb";
import { AppContext, Repository, Settings } from "./types.js";
import { DEFAULT_SETTINGS } from "./constants.js";
import { createLogger } from "./logger.js";
const logger = createLogger('db');
export type Schema = {
settings: Settings,
repos: {
[key: string]: Repository;
}
}
export const DEFAULT_DB_DATA: Schema = {
repos: {},
settings: DEFAULT_SETTINGS,
}
export type Database = Low<Schema>;
export const loadDB = async (ctx: AppContext): Promise<Database> => {
const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, DEFAULT_DB_DATA);
await applyMigrations(db);
return db;
}
export const updateRepository = async (repoId: string, data: Repository, db: Database) => {
db.data.repos[repoId] = {
...db.data.repos[repoId],
...data,
}
await db.write();
}
export const updateSettings = async (settings: Settings, db: Database) => {
db.data.settings = settings;
await db.write();
}
export const createRepository = async (repo: Repository, db: Database) => {
db.data.repos[repo.id] = repo;
await db.write();
}
export const applyMigrations = async (db: Database) => {
const log = (name: string) => {
logger.info(`Applying migration '${name}'`);
}
await db.update((schema) => {
// @NOTE: please ensure new migrations are added after older ones!
schema = migration_addSettings(schema, log);
schema = migration_addMaxFileSize(schema, log);
schema = migration_addDeleteStaleRepos(schema, log);
schema = migration_addReindexInterval(schema, log);
schema = migration_addResyncInterval(schema, log);
return schema;
});
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/118
*/
export const migration_addSettings = (schema: Schema, log?: (name: string) => void) => {
if (!schema.settings) {
log?.("addSettings");
schema.settings = DEFAULT_SETTINGS;
}
return schema;
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/118
*/
export const migration_addMaxFileSize = (schema: Schema, log?: (name: string) => void) => {
if (!schema.settings.maxFileSize) {
log?.("addMaxFileSize");
schema.settings.maxFileSize = DEFAULT_SETTINGS.maxFileSize;
}
return schema;
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/128
*/
export const migration_addDeleteStaleRepos = (schema: Schema, log?: (name: string) => void) => {
if (schema.settings.autoDeleteStaleRepos === undefined) {
log?.("addDeleteStaleRepos");
schema.settings.autoDeleteStaleRepos = DEFAULT_SETTINGS.autoDeleteStaleRepos;
}
return schema;
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/134
*/
export const migration_addReindexInterval = (schema: Schema, log?: (name: string) => void) => {
if (schema.settings.reindexInterval === undefined) {
log?.("addReindexInterval");
schema.settings.reindexInterval = DEFAULT_SETTINGS.reindexInterval;
}
return schema;
}
/**
* @see: https://github.com/sourcebot-dev/sourcebot/pull/134
*/
export const migration_addResyncInterval = (schema: Schema, log?: (name: string) => void) => {
if (schema.settings.resyncInterval === undefined) {
log?.("addResyncInterval");
schema.settings.resyncInterval = DEFAULT_SETTINGS.resyncInterval;
}
return schema;
}


@@ -1,48 +1,42 @@
import { GitRepository, AppContext } from './types.js';
import { simpleGit, SimpleGitProgressEvent } from 'simple-git';
import { existsSync } from 'fs';
import { createLogger } from './logger.js';
import { GitConfig } from './schemas/v2.js';
import path from 'path';
const logger = createLogger('git');
export const cloneRepository = async (repo: GitRepository, onProgress?: (event: SimpleGitProgressEvent) => void) => {
if (existsSync(repo.path)) {
logger.warn(`${repo.id} already exists. Skipping clone.`)
return;
}
export const cloneRepository = async (cloneURL: string, path: string, gitConfig?: Record<string, string>, onProgress?: (event: SimpleGitProgressEvent) => void) => {
const git = simpleGit({
progress: onProgress,
});
const gitConfig = Object.entries(repo.gitConfigMetadata ?? {}).flatMap(
const configParams = Object.entries(gitConfig ?? {}).flatMap(
([key, value]) => ['--config', `${key}=${value}`]
);
await git.clone(
repo.cloneUrl,
repo.path,
cloneURL,
path,
[
"--bare",
...gitConfig
...configParams
]
);
await git.cwd({
path: repo.path,
path,
}).addConfig("remote.origin.fetch", "+refs/heads/*:refs/heads/*");
}
export const fetchRepository = async (repo: GitRepository, onProgress?: (event: SimpleGitProgressEvent) => void) => {
export const fetchRepository = async (path: string, onProgress?: (event: SimpleGitProgressEvent) => void) => {
const git = simpleGit({
progress: onProgress,
});
await git.cwd({
path: repo.path,
path: path,
}).fetch(
"origin",
[


@@ -0,0 +1,206 @@
import { expect, test } from 'vitest';
import { OctokitRepository, shouldExcludeRepo } from './github';
test('shouldExcludeRepo returns true when clone_url is undefined', () => {
const repo = { full_name: 'test/repo' } as OctokitRepository;
expect(shouldExcludeRepo({
repo,
})).toBe(true);
});
test('shouldExcludeRepo returns false when the repo is not excluded.', () => {
const repo = {
full_name: 'test/repo',
clone_url: 'https://github.com/test/repo.git',
} as OctokitRepository;
expect(shouldExcludeRepo({
repo,
})).toBe(false);
});
test('shouldExcludeRepo handles forked repos correctly', () => {
const repo = {
full_name: 'test/forked-repo',
clone_url: 'https://github.com/test/forked-repo.git',
fork: true,
} as OctokitRepository;
expect(shouldExcludeRepo({ repo })).toBe(false);
expect(shouldExcludeRepo({ repo, exclude: { forks: true } })).toBe(true);
expect(shouldExcludeRepo({ repo, exclude: { forks: false } })).toBe(false);
});
test('shouldExcludeRepo handles archived repos correctly', () => {
const repo = {
full_name: 'test/archived-repo',
clone_url: 'https://github.com/test/archived-repo.git',
archived: true,
} as OctokitRepository;
expect(shouldExcludeRepo({ repo })).toBe(false);
expect(shouldExcludeRepo({ repo, exclude: { archived: true } })).toBe(true);
expect(shouldExcludeRepo({ repo, exclude: { archived: false } })).toBe(false);
});
test('shouldExcludeRepo handles include.topics correctly', () => {
const repo = {
full_name: 'test/repo',
clone_url: 'https://github.com/test/repo.git',
topics: [
'test-topic',
'another-topic'
] as string[],
} as OctokitRepository;
expect(shouldExcludeRepo({
repo,
include: {}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
include: {
topics: [],
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
include: {
topics: ['a-topic-that-does-not-exist'],
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
include: {
topics: ['test-topic'],
}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
include: {
topics: ['test-*'],
}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
include: {
topics: ['TEST-tOpIC'],
}
})).toBe(false);
});
test('shouldExcludeRepo handles exclude.topics correctly', () => {
const repo = {
full_name: 'test/repo',
clone_url: 'https://github.com/test/repo.git',
topics: [
'test-topic',
'another-topic'
],
} as OctokitRepository;
expect(shouldExcludeRepo({
repo,
exclude: {}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
exclude: {
topics: [],
}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
exclude: {
topics: ['a-topic-that-does-not-exist'],
}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
exclude: {
topics: ['test-topic'],
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
exclude: {
topics: ['test-*'],
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
exclude: {
topics: ['TEST-tOpIC'],
}
})).toBe(true);
});
test('shouldExcludeRepo handles exclude.size correctly', () => {
const repo = {
full_name: 'test/repo',
clone_url: 'https://github.com/test/repo.git',
size: 6, // 6KB
} as OctokitRepository;
expect(shouldExcludeRepo({
repo,
exclude: {
size: {
min: 10 * 1000, // 10KB
}
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
exclude: {
size: {
max: 2 * 1000, // 2KB
}
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
exclude: {
size: {
min: 5 * 1000, // 5KB
max: 10 * 1000, // 10KB
}
}
})).toBe(false);
});
test('shouldExcludeRepo handles exclude.repos correctly', () => {
const repo = {
full_name: 'test/example-repo',
clone_url: 'https://github.com/test/example-repo.git',
} as OctokitRepository;
expect(shouldExcludeRepo({
repo,
exclude: {
repos: []
}
})).toBe(false);
expect(shouldExcludeRepo({
repo,
exclude: {
repos: ['test/example-repo']
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
exclude: {
repos: ['test/*']
}
})).toBe(true);
expect(shouldExcludeRepo({
repo,
exclude: {
repos: ['repo-does-not-exist']
}
})).toBe(false);
});


@ -1,14 +1,13 @@
import { Octokit } from "@octokit/rest";
import { GitHubConfig } from "./schemas/v2.js";
import { createLogger } from "./logger.js";
import { AppContext, GitRepository } from "./types.js";
import path from 'path';
import { excludeArchivedRepos, excludeForkedRepos, excludeReposByName, excludeReposByTopic, getTokenFromConfig, includeReposByTopic, marshalBool, measure } from "./utils.js";
import { AppContext } from "./types.js";
import { getTokenFromConfig, measure } from "./utils.js";
import micromatch from "micromatch";
const logger = createLogger("GitHub");
type OctokitRepository = {
export type OctokitRepository = {
name: string,
id: number,
full_name: string,
@@ -22,6 +21,7 @@ type OctokitRepository = {
forks_count?: number,
archived?: boolean,
topics?: string[],
// @note: this is expressed in kilobytes.
size?: number,
}
@@ -54,191 +54,123 @@ export const getGitHubReposFromConfig = async (config: GitHubConfig, signal: Abo
}
// Marshall results to our type
let repos: GitRepository[] = allRepos
let repos = allRepos
.filter((repo) => {
if (!repo.clone_url) {
logger.warn(`Repository ${repo.name} missing property 'clone_url'. Excluding.`)
return false;
}
return true;
})
.map((repo) => {
const hostname = config.url ? new URL(config.url).hostname : 'github.com';
const repoId = `${hostname}/${repo.full_name}`;
const repoPath = path.resolve(path.join(ctx.reposPath, `${repoId}.git`));
const cloneUrl = new URL(repo.clone_url!);
if (token) {
cloneUrl.username = token;
}
return {
vcs: 'git',
codeHost: 'github',
name: repo.full_name,
id: repoId,
cloneUrl: cloneUrl.toString(),
path: repoPath,
isStale: false,
isFork: repo.fork,
isArchived: !!repo.archived,
topics: repo.topics ?? [],
gitConfigMetadata: {
'zoekt.web-url-type': 'github',
'zoekt.web-url': repo.html_url,
'zoekt.name': repoId,
'zoekt.github-stars': (repo.stargazers_count ?? 0).toString(),
'zoekt.github-watchers': (repo.watchers_count ?? 0).toString(),
'zoekt.github-subscribers': (repo.subscribers_count ?? 0).toString(),
'zoekt.github-forks': (repo.forks_count ?? 0).toString(),
'zoekt.archived': marshalBool(repo.archived),
'zoekt.fork': marshalBool(repo.fork),
'zoekt.public': marshalBool(repo.private === false)
const isExcluded = shouldExcludeRepo({
repo,
include: {
topics: config.topics,
},
sizeInBytes: repo.size ? repo.size * 1000 : undefined,
branches: [],
tags: [],
} satisfies GitRepository;
exclude: config.exclude,
});
return !isExcluded;
});
if (config.topics) {
const topics = config.topics.map(topic => topic.toLowerCase());
repos = includeReposByTopic(repos, topics, logger);
}
if (config.exclude) {
if (!!config.exclude.forks) {
repos = excludeForkedRepos(repos, logger);
}
if (!!config.exclude.archived) {
repos = excludeArchivedRepos(repos, logger);
}
if (config.exclude.repos) {
repos = excludeReposByName(repos, config.exclude.repos, logger);
}
if (config.exclude.topics) {
const topics = config.exclude.topics.map(topic => topic.toLowerCase());
repos = excludeReposByTopic(repos, topics, logger);
}
if (config.exclude.size) {
const min = config.exclude.size.min;
const max = config.exclude.size.max;
if (min) {
repos = repos.filter((repo) => {
// If we don't have a size, we can't filter by size.
if (!repo.sizeInBytes) {
return true;
}
if (repo.sizeInBytes < min) {
logger.debug(`Excluding repo ${repo.name}. Reason: repo is less than \`exclude.size.min\`=${min} bytes.`);
return false;
}
return true;
});
}
if (max) {
repos = repos.filter((repo) => {
// If we don't have a size, we can't filter by size.
if (!repo.sizeInBytes) {
return true;
}
if (repo.sizeInBytes > max) {
logger.debug(`Excluding repo ${repo.name}. Reason: repo is greater than \`exclude.size.max\`=${max} bytes.`);
return false;
}
return true;
});
}
}
}
logger.debug(`Found ${repos.length} total repositories.`);
if (config.revisions) {
if (config.revisions.branches) {
const branchGlobs = config.revisions.branches;
repos = await Promise.all(
repos.map(async (repo) => {
const [owner, name] = repo.name.split('/');
let branches = (await getBranchesForRepo(owner, name, octokit, signal)).map(branch => branch.name);
branches = micromatch.match(branches, branchGlobs);
return {
...repo,
branches,
};
})
)
}
if (config.revisions.tags) {
const tagGlobs = config.revisions.tags;
repos = await Promise.all(
repos.map(async (repo) => {
const [owner, name] = repo.name.split('/');
let tags = (await getTagsForRepo(owner, name, octokit, signal)).map(tag => tag.name);
tags = micromatch.match(tags, tagGlobs);
return {
...repo,
tags,
};
})
)
}
}
return repos;
}
const getTagsForRepo = async (owner: string, repo: string, octokit: Octokit, signal: AbortSignal) => {
try {
logger.debug(`Fetching tags for repo ${owner}/${repo}...`);
const { durationMs, data: tags } = await measure(() => octokit.paginate(octokit.repos.listTags, {
owner,
repo,
per_page: 100,
request: {
signal
}
}));
export const getGitHubRepoFromId = async (id: string, hostURL: string, token?: string) => {
const octokit = new Octokit({
auth: token,
...(hostURL !== 'https://github.com' ? {
baseUrl: `${hostURL}/api/v3`
} : {})
});
logger.debug(`Found ${tags.length} tags for repo ${owner}/${repo} in ${durationMs}ms`);
return tags;
} catch (e) {
logger.debug(`Error fetching tags for repo ${owner}/${repo}: ${e}`);
return [];
}
const repo = await octokit.request('GET /repositories/:id', {
id,
});
return repo;
}
const getBranchesForRepo = async (owner: string, repo: string, octokit: Octokit, signal: AbortSignal) => {
try {
logger.debug(`Fetching branches for repo ${owner}/${repo}...`);
const { durationMs, data: branches } = await measure(() => octokit.paginate(octokit.repos.listBranches, {
owner,
repo,
per_page: 100,
request: {
signal
}
}));
logger.debug(`Found ${branches.length} branches for repo ${owner}/${repo} in ${durationMs}ms`);
return branches;
} catch (e) {
logger.debug(`Error fetching branches for repo ${owner}/${repo}: ${e}`);
return [];
}
}
export const shouldExcludeRepo = ({
repo,
include,
exclude
} : {
repo: OctokitRepository,
include?: {
topics?: GitHubConfig['topics']
},
exclude?: GitHubConfig['exclude']
}) => {
let reason = '';
const repoName = repo.full_name;
const shouldExclude = (() => {
if (!repo.clone_url) {
reason = 'clone_url is undefined';
return true;
}
if (!!exclude?.forks && repo.fork) {
reason = `\`exclude.forks\` is true`;
return true;
}
if (!!exclude?.archived && !!repo.archived) {
reason = `\`exclude.archived\` is true`;
return true;
}
if (exclude?.repos) {
if (micromatch.isMatch(repoName, exclude.repos)) {
reason = `\`exclude.repos\` contains ${repoName}`;
return true;
}
}
if (exclude?.topics) {
const configTopics = exclude.topics.map(topic => topic.toLowerCase());
const repoTopics = repo.topics ?? [];
const matchingTopics = repoTopics.filter((topic) => micromatch.isMatch(topic, configTopics));
if (matchingTopics.length > 0) {
reason = `\`exclude.topics\` matches the following topics: ${matchingTopics.join(', ')}`;
return true;
}
}
if (include?.topics) {
const configTopics = include.topics.map(topic => topic.toLowerCase());
const repoTopics = repo.topics ?? [];
const matchingTopics = repoTopics.filter((topic) => micromatch.isMatch(topic, configTopics));
if (matchingTopics.length === 0) {
reason = `\`include.topics\` does not match any of the following topics: ${configTopics.join(', ')}`;
return true;
}
}
const repoSizeInBytes = repo.size ? repo.size * 1000 : undefined;
if (exclude?.size && repoSizeInBytes) {
const min = exclude.size.min;
const max = exclude.size.max;
if (min && repoSizeInBytes < min) {
reason = `repo is less than \`exclude.size.min\`=${min} bytes.`;
return true;
}
if (max && repoSizeInBytes > max) {
reason = `repo is greater than \`exclude.size.max\`=${max} bytes.`;
return true;
}
}
return false;
})();
if (shouldExclude) {
logger.debug(`Excluding repo ${repoName}. Reason: ${reason}`);
return true;
}
return false;
}
const getReposOwnedByUsers = async (users: string[], isAuthenticated: boolean, octokit: Octokit, signal: AbortSignal) => {
const repos = (await Promise.all(users.map(async (user) => {


@@ -0,0 +1,43 @@
import { expect, test } from 'vitest';
import { shouldExcludeProject } from './gitlab';
import { ProjectSchema } from '@gitbeaker/rest';
test('shouldExcludeProject returns false when the project is not excluded.', () => {
const project = {
path_with_namespace: 'test/project',
} as ProjectSchema;
expect(shouldExcludeProject({
project,
})).toBe(false);
});
test('shouldExcludeProject returns true when the project is excluded by exclude.archived.', () => {
const project = {
path_with_namespace: 'test/project',
archived: true,
} as ProjectSchema;
expect(shouldExcludeProject({
project,
exclude: {
archived: true,
}
})).toBe(true)
});
test('shouldExcludeProject returns true when the project is excluded by exclude.forks.', () => {
const project = {
path_with_namespace: 'test/project',
forked_from_project: {}
} as unknown as ProjectSchema;
expect(shouldExcludeProject({
project,
exclude: {
forks: true,
}
})).toBe(true)
});


@@ -1,13 +1,12 @@
import { Gitlab, ProjectSchema } from "@gitbeaker/rest";
import { GitLabConfig } from "./schemas/v2.js";
import { excludeArchivedRepos, excludeForkedRepos, excludeReposByName, excludeReposByTopic, getTokenFromConfig, includeReposByTopic, marshalBool, measure } from "./utils.js";
import { createLogger } from "./logger.js";
import { AppContext, GitRepository } from "./types.js";
import path from 'path';
import micromatch from "micromatch";
import { createLogger } from "./logger.js";
import { GitLabConfig } from "./schemas/v2.js";
import { AppContext } from "./types.js";
import { getTokenFromConfig, marshalBool, measure } from "./utils.js";
const logger = createLogger("GitLab");
const GITLAB_CLOUD_HOSTNAME = "gitlab.com";
export const GITLAB_CLOUD_HOSTNAME = "gitlab.com";
export const getGitLabReposFromConfig = async (config: GitLabConfig, ctx: AppContext) => {
const token = config.token ? getTokenFromConfig(config.token, ctx) : undefined;
@@ -94,115 +93,83 @@ export const getGitLabReposFromConfig = async (config: GitLabConfig, ctx: AppCon
allProjects = allProjects.concat(_projects);
}
let repos: GitRepository[] = allProjects
.map((project) => {
const repoId = `${hostname}/${project.path_with_namespace}`;
const repoPath = path.resolve(path.join(ctx.reposPath, `${repoId}.git`))
const isFork = project.forked_from_project !== undefined;
const cloneUrl = new URL(project.http_url_to_repo);
if (token) {
cloneUrl.username = 'oauth2';
cloneUrl.password = token;
}
return {
vcs: 'git',
codeHost: 'gitlab',
name: project.path_with_namespace,
id: repoId,
cloneUrl: cloneUrl.toString(),
path: repoPath,
isStale: false,
isFork,
isArchived: project.archived,
topics: project.topics ?? [],
gitConfigMetadata: {
'zoekt.web-url-type': 'gitlab',
'zoekt.web-url': project.web_url,
'zoekt.name': repoId,
'zoekt.gitlab-stars': project.star_count?.toString() ?? '0',
'zoekt.gitlab-forks': project.forks_count?.toString() ?? '0',
'zoekt.archived': marshalBool(project.archived),
'zoekt.fork': marshalBool(isFork),
'zoekt.public': marshalBool(project.visibility === 'public'),
let repos = allProjects
.filter((project) => {
const isExcluded = shouldExcludeProject({
project,
include: {
topics: config.topics,
},
branches: [],
tags: [],
} satisfies GitRepository;
exclude: config.exclude
});
return !isExcluded;
});
if (config.topics) {
const topics = config.topics.map(topic => topic.toLowerCase());
repos = includeReposByTopic(repos, topics, logger);
}
if (config.exclude) {
if (!!config.exclude.forks) {
repos = excludeForkedRepos(repos, logger);
}
if (!!config.exclude.archived) {
repos = excludeArchivedRepos(repos, logger);
}
if (config.exclude.projects) {
repos = excludeReposByName(repos, config.exclude.projects, logger);
}
if (config.exclude.topics) {
const topics = config.exclude.topics.map(topic => topic.toLowerCase());
repos = excludeReposByTopic(repos, topics, logger);
}
}
logger.debug(`Found ${repos.length} total repositories.`);
if (config.revisions) {
if (config.revisions.branches) {
const branchGlobs = config.revisions.branches;
repos = await Promise.all(repos.map(async (repo) => {
try {
logger.debug(`Fetching branches for repo ${repo.name}...`);
let { durationMs, data } = await measure(() => api.Branches.all(repo.name));
logger.debug(`Found ${data.length} branches in repo ${repo.name} in ${durationMs}ms.`);
let branches = data.map((branch) => branch.name);
branches = micromatch.match(branches, branchGlobs);
return {
...repo,
branches,
};
} catch (e) {
logger.error(`Failed to fetch branches for repo ${repo.name}.`, e);
return repo;
}
}));
}
if (config.revisions.tags) {
const tagGlobs = config.revisions.tags;
repos = await Promise.all(repos.map(async (repo) => {
try {
logger.debug(`Fetching tags for repo ${repo.name}...`);
let { durationMs, data } = await measure(() => api.Tags.all(repo.name));
logger.debug(`Found ${data.length} tags in repo ${repo.name} in ${durationMs}ms.`);
let tags = data.map((tag) => tag.name);
tags = micromatch.match(tags, tagGlobs);
return {
...repo,
tags,
};
} catch (e) {
logger.error(`Failed to fetch tags for repo ${repo.name}.`, e);
return repo;
}
}));
}
}
return repos;
}
export const shouldExcludeProject = ({
project,
include,
exclude,
}: {
project: ProjectSchema,
include?: {
topics?: GitLabConfig['topics'],
},
exclude?: GitLabConfig['exclude'],
}) => {
const projectName = project.path_with_namespace;
let reason = '';
const shouldExclude = (() => {
if (!!exclude?.archived && project.archived) {
reason = `\`exclude.archived\` is true`;
return true;
}
if (!!exclude?.forks && project.forked_from_project !== undefined) {
reason = `\`exclude.forks\` is true`;
return true;
}
if (exclude?.projects) {
if (micromatch.isMatch(projectName, exclude.projects)) {
reason = `\`exclude.projects\` contains ${projectName}`;
return true;
}
}
if (include?.topics) {
const configTopics = include.topics.map(topic => topic.toLowerCase());
const projectTopics = project.topics ?? [];
const matchingTopics = projectTopics.filter((topic) => micromatch.isMatch(topic, configTopics));
if (matchingTopics.length === 0) {
reason = `\`include.topics\` does not match any of the following topics: ${configTopics.join(', ')}`;
return true;
}
}
if (exclude?.topics) {
const configTopics = exclude.topics.map(topic => topic.toLowerCase());
const projectTopics = project.topics ?? [];
const matchingTopics = projectTopics.filter((topic) => micromatch.isMatch(topic, configTopics));
if (matchingTopics.length > 0) {
reason = `\`exclude.topics\` matches the following topics: ${matchingTopics.join(', ')}`;
return true;
}
}
})();
if (shouldExclude) {
logger.debug(`Excluding project ${projectName}. Reason: ${reason}`);
return true;
}
return false;
}


@@ -5,6 +5,7 @@ import path from 'path';
import { isRemotePath } from "./utils.js";
import { AppContext } from "./types.js";
import { main } from "./main.js"
import { PrismaClient } from "@sourcebot/db";
const parser = new ArgumentParser({
@@ -50,6 +51,17 @@ const context: AppContext = {
configPath: args.configPath,
}
main(context).finally(() => {
console.log("Shutting down...");
});
const prisma = new PrismaClient();
main(prisma, context)
.then(async () => {
await prisma.$disconnect();
})
.catch(async (e) => {
console.error(e);
await prisma.$disconnect();
process.exit(1);
})
.finally(() => {
console.log("Shutting down...");
});


@@ -1,206 +0,0 @@
import { expect, test, vi } from 'vitest';
import { deleteStaleRepository, isAllRepoReindexingRequired, isRepoReindexingRequired } from './main';
import { AppContext, GitRepository, LocalRepository, Repository, Settings } from './types';
import { DEFAULT_DB_DATA } from './db';
import { createMockDB } from './db.test';
import { rm } from 'fs/promises';
import path from 'path';
import { glob } from 'glob';
vi.mock('fs/promises', () => ({
rm: vi.fn(),
}));
vi.mock('glob', () => ({
glob: vi.fn().mockReturnValue(['fake_index.zoekt']),
}));
vi.mock('fs', () => ({
existsSync: vi.fn().mockReturnValue(true),
}));
const createMockContext = (rootPath: string = '/app') => {
return {
configPath: path.join(rootPath, 'config.json'),
cachePath: path.join(rootPath, '.sourcebot'),
indexPath: path.join(rootPath, '.sourcebot/index'),
reposPath: path.join(rootPath, '.sourcebot/repos'),
} satisfies AppContext;
}
test('isRepoReindexingRequired should return false when no changes are made', () => {
const previous: Repository = {
vcs: 'git',
name: 'test',
id: 'test',
path: '',
cloneUrl: '',
isStale: false,
branches: ['main'],
tags: ['v1.0'],
};
const current = previous;
expect(isRepoReindexingRequired(previous, current)).toBe(false);
})
test('isRepoReindexingRequired should return true when git branches change', () => {
const previous: Repository = {
vcs: 'git',
name: 'test',
id: 'test',
path: '',
cloneUrl: '',
isStale: false,
branches: ['main'],
tags: ['v1.0'],
};
const current: Repository = {
...previous,
branches: ['main', 'feature']
};
expect(isRepoReindexingRequired(previous, current)).toBe(true);
});
test('isRepoReindexingRequired should return true when git tags change', () => {
const previous: Repository = {
vcs: 'git',
name: 'test',
id: 'test',
path: '',
cloneUrl: '',
isStale: false,
branches: ['main'],
tags: ['v1.0'],
};
const current: Repository = {
...previous,
tags: ['v1.0', 'v2.0']
};
expect(isRepoReindexingRequired(previous, current)).toBe(true);
});
test('isRepoReindexingRequired should return true when local excludedPaths change', () => {
const previous: Repository = {
vcs: 'local',
name: 'test',
id: 'test',
path: '/',
isStale: false,
excludedPaths: ['node_modules'],
watch: false,
};
const current: Repository = {
...previous,
excludedPaths: ['node_modules', 'dist']
};
expect(isRepoReindexingRequired(previous, current)).toBe(true);
});
test('isAllRepoReindexingRequired should return false when fileLimitSize has not changed', () => {
const previous: Settings = {
maxFileSize: 1000,
autoDeleteStaleRepos: true,
}
const current: Settings = {
...previous,
}
expect(isAllRepoReindexingRequired(previous, current)).toBe(false);
});
test('isAllRepoReindexingRequired should return true when fileLimitSize has changed', () => {
const previous: Settings = {
maxFileSize: 1000,
autoDeleteStaleRepos: true,
}
const current: Settings = {
...previous,
maxFileSize: 2000,
}
expect(isAllRepoReindexingRequired(previous, current)).toBe(true);
});
test('isAllRepoReindexingRequired should return false when autoDeleteStaleRepos has changed', () => {
const previous: Settings = {
maxFileSize: 1000,
autoDeleteStaleRepos: true,
}
const current: Settings = {
...previous,
autoDeleteStaleRepos: false,
}
expect(isAllRepoReindexingRequired(previous, current)).toBe(false);
});
test('deleteStaleRepository can delete a git repository', async () => {
const ctx = createMockContext();
const repo: GitRepository = {
id: 'github.com/sourcebot-dev/sourcebot',
vcs: 'git',
name: 'sourcebot',
cloneUrl: 'https://github.com/sourcebot-dev/sourcebot',
path: `${ctx.reposPath}/github.com/sourcebot-dev/sourcebot`,
branches: ['main'],
tags: [''],
isStale: true,
}
const db = createMockDB({
...DEFAULT_DB_DATA,
repos: {
'github.com/sourcebot-dev/sourcebot': repo,
}
});
await deleteStaleRepository(repo, db, ctx);
expect(db.data.repos['github.com/sourcebot-dev/sourcebot']).toBeUndefined();
expect(rm).toHaveBeenCalledWith(`${ctx.reposPath}/github.com/sourcebot-dev/sourcebot`, {
recursive: true,
});
expect(glob).toHaveBeenCalledWith(`github.com%2Fsourcebot-dev%2Fsourcebot*.zoekt`, {
cwd: ctx.indexPath,
absolute: true
});
expect(rm).toHaveBeenCalledWith(`fake_index.zoekt`);
});
test('deleteStaleRepository can delete a local repository', async () => {
const ctx = createMockContext();
const repo: LocalRepository = {
vcs: 'local',
name: 'UnrealEngine',
id: '/path/to/UnrealEngine',
path: '/path/to/UnrealEngine',
watch: false,
excludedPaths: [],
isStale: true,
}
const db = createMockDB({
...DEFAULT_DB_DATA,
repos: {
'/path/to/UnrealEngine': repo,
}
});
await deleteStaleRepository(repo, db, ctx);
expect(db.data.repos['/path/to/UnrealEngine']).toBeUndefined();
expect(rm).not.toHaveBeenCalledWith('/path/to/UnrealEngine');
expect(glob).toHaveBeenCalledWith(`UnrealEngine*.zoekt`, {
cwd: ctx.indexPath,
absolute: true
});
expect(rm).toHaveBeenCalledWith('fake_index.zoekt');
});


@@ -1,44 +1,38 @@
import { readFile, rm } from 'fs/promises';
import { PrismaClient, Repo } from '@sourcebot/db';
import { existsSync, watch } from 'fs';
import { SourcebotConfigurationSchema } from "./schemas/v2.js";
import { getGitHubReposFromConfig } from "./github.js";
import { getGitLabReposFromConfig } from "./gitlab.js";
import { getGiteaReposFromConfig } from "./gitea.js";
import { getGerritReposFromConfig } from "./gerrit.js";
import { AppContext, LocalRepository, GitRepository, Repository, Settings } from "./types.js";
import { cloneRepository, fetchRepository, getGitRepoFromConfig } from "./git.js";
import { syncConfig } from "./config.js";
import { cloneRepository, fetchRepository } from "./git.js";
import { createLogger } from "./logger.js";
import { createRepository, Database, loadDB, updateRepository, updateSettings } from './db.js';
import { arraysEqualShallow, isRemotePath, measure } from "./utils.js";
import { DEFAULT_SETTINGS } from "./constants.js";
import stripJsonComments from 'strip-json-comments';
import { indexGitRepository, indexLocalRepository } from "./zoekt.js";
import { getLocalRepoFromConfig, initLocalRepoFileWatchers } from "./local.js";
import { captureEvent } from "./posthog.js";
import { glob } from 'glob';
import path from 'path';
import { AppContext } from "./types.js";
import { getRepoPath, isRemotePath, measure } from "./utils.js";
import { indexGitRepository } from "./zoekt.js";
import { DEFAULT_SETTINGS } from './constants.js';
const logger = createLogger('main');
const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
const syncGitRepository = async (repo: Repo, ctx: AppContext) => {
let fetchDuration_s: number | undefined = undefined;
let cloneDuration_s: number | undefined = undefined;
if (existsSync(repo.path)) {
const repoPath = getRepoPath(repo, ctx);
const metadata = repo.metadata as Record<string, string>;
if (existsSync(repoPath)) {
logger.info(`Fetching ${repo.id}...`);
const { durationMs } = await measure(() => fetchRepository(repo, ({ method, stage , progress}) => {
const { durationMs } = await measure(() => fetchRepository(repoPath, ({ method, stage, progress }) => {
logger.info(`git.${method} ${stage} stage ${progress}% complete for ${repo.id}`)
}));
fetchDuration_s = durationMs / 1000;
process.stdout.write('\n');
logger.info(`Fetched ${repo.id} in ${fetchDuration_s}s`);
logger.info(`Fetched ${repo.name} in ${fetchDuration_s}s`);
} else {
logger.info(`Cloning ${repo.id}...`);
const { durationMs } = await measure(() => cloneRepository(repo, ({ method, stage, progress }) => {
const { durationMs } = await measure(() => cloneRepository(repo.cloneUrl, repoPath, metadata, ({ method, stage, progress }) => {
logger.info(`git.${method} ${stage} stage ${progress}% complete for ${repo.id}`)
}));
cloneDuration_s = durationMs / 1000;
@@ -48,7 +42,7 @@ const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: A
}
logger.info(`Indexing ${repo.id}...`);
const { durationMs } = await measure(() => indexGitRepository(repo, settings, ctx));
const { durationMs } = await measure(() => indexGitRepository(repo, ctx));
const indexDuration_s = durationMs / 1000;
logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
@@ -59,262 +53,7 @@ const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: A
}
}
const syncLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => {
logger.info(`Indexing ${repo.id}...`);
const { durationMs } = await measure(() => indexLocalRepository(repo, settings, ctx, signal));
const indexDuration_s = durationMs / 1000;
logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
return {
indexDuration_s,
}
}
export const deleteStaleRepository = async (repo: Repository, db: Database, ctx: AppContext) => {
logger.info(`Deleting stale repository ${repo.id}:`);
// Delete the checked out git repository (if applicable)
if (repo.vcs === "git" && existsSync(repo.path)) {
logger.info(`\tDeleting git directory ${repo.path}...`);
await rm(repo.path, {
recursive: true,
});
}
// Delete all .zoekt index files
{
// .zoekt index files are named with the repository name,
// index version, and shard number. Some examples:
//
// git repos:
// github.com%2Fsourcebot-dev%2Fsourcebot_v16.00000.zoekt
// gitlab.com%2Fmy-org%2Fmy-project.00000.zoekt
//
// local repos:
// UnrealEngine_v16.00000.zoekt
// UnrealEngine_v16.00001.zoekt
// ...
// UnrealEngine_v16.00016.zoekt
//
// Notice that local repos are named with the repository basename and
// git repos are named with the query-encoded repository name. Form a
// glob pattern with the correct prefix & suffix to match the correct
// index file(s) for the repository.
//
// @see : https://github.com/sourcegraph/zoekt/blob/c03b77fbf18b76904c0e061f10f46597eedd7b14/build/builder.go#L348
const indexFilesGlobPattern = (() => {
switch (repo.vcs) {
case 'git':
return `${encodeURIComponent(repo.id)}*.zoekt`;
case 'local':
return `${path.basename(repo.path)}*.zoekt`;
}
})();
const indexFiles = await glob(indexFilesGlobPattern, {
cwd: ctx.indexPath,
absolute: true
});
await Promise.all(indexFiles.map((file) => {
if (!existsSync(file)) {
return;
}
logger.info(`\tDeleting index file ${file}...`);
return rm(file);
}));
}
// Delete db entry
logger.info(`\tDeleting db entry...`);
await db.update(({ repos }) => {
delete repos[repo.id];
});
logger.info(`Deleted stale repository ${repo.id}`);
captureEvent('repo_deleted', {
vcs: repo.vcs,
codeHost: repo.codeHost,
})
}
/**
* Certain configuration changes (e.g., a branch is added) require
* a reindexing of the repository.
*/
export const isRepoReindexingRequired = (previous: Repository, current: Repository) => {
/**
* Checks if the any of the `revisions` properties have changed.
*/
const isRevisionsChanged = () => {
if (previous.vcs !== 'git' || current.vcs !== 'git') {
return false;
}
return (
!arraysEqualShallow(previous.branches, current.branches) ||
!arraysEqualShallow(previous.tags, current.tags)
);
}
/**
* Check if the `exclude.paths` property has changed.
*/
const isExcludePathsChanged = () => {
if (previous.vcs !== 'local' || current.vcs !== 'local') {
return false;
}
return !arraysEqualShallow(previous.excludedPaths, current.excludedPaths);
}
return (
isRevisionsChanged() ||
isExcludePathsChanged()
)
}
/**
* Certain settings changes (e.g., the file limit size is changed) require
* a reindexing of _all_ repositories.
*/
export const isAllRepoReindexingRequired = (previous: Settings, current: Settings) => {
return (
previous?.maxFileSize !== current?.maxFileSize
)
}
const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, ctx: AppContext) => {
const configContent = await (async () => {
if (isRemotePath(configPath)) {
const response = await fetch(configPath, {
signal,
});
if (!response.ok) {
throw new Error(`Failed to fetch config file ${configPath}: ${response.statusText}`);
}
return response.text();
} else {
return readFile(configPath, {
encoding: 'utf-8',
signal,
});
}
})();
// @todo: we should validate the configuration file's structure here.
const config = JSON.parse(stripJsonComments(configContent)) as SourcebotConfigurationSchema;
// Update the settings
const updatedSettings: Settings = {
maxFileSize: config.settings?.maxFileSize ?? DEFAULT_SETTINGS.maxFileSize,
autoDeleteStaleRepos: config.settings?.autoDeleteStaleRepos ?? DEFAULT_SETTINGS.autoDeleteStaleRepos,
reindexInterval: config.settings?.reindexInterval ?? DEFAULT_SETTINGS.reindexInterval,
resyncInterval: config.settings?.resyncInterval ?? DEFAULT_SETTINGS.resyncInterval,
}
const _isAllRepoReindexingRequired = isAllRepoReindexingRequired(db.data.settings, updatedSettings);
await updateSettings(updatedSettings, db);
// Fetch all repositories from the config file
let configRepos: Repository[] = [];
for (const repoConfig of config.repos ?? []) {
switch (repoConfig.type) {
case 'github': {
const gitHubRepos = await getGitHubReposFromConfig(repoConfig, signal, ctx);
configRepos.push(...gitHubRepos);
break;
}
case 'gitlab': {
const gitLabRepos = await getGitLabReposFromConfig(repoConfig, ctx);
configRepos.push(...gitLabRepos);
break;
}
case 'gitea': {
const giteaRepos = await getGiteaReposFromConfig(repoConfig, ctx);
configRepos.push(...giteaRepos);
break;
}
case 'gerrit': {
const gerritRepos = await getGerritReposFromConfig(repoConfig, ctx);
configRepos.push(...gerritRepos);
break;
}
case 'local': {
const repo = getLocalRepoFromConfig(repoConfig, ctx);
configRepos.push(repo);
break;
}
case 'git': {
const gitRepo = await getGitRepoFromConfig(repoConfig, ctx);
gitRepo && configRepos.push(gitRepo);
break;
}
}
}
// De-duplicate on id
configRepos.sort((a, b) => {
return a.id.localeCompare(b.id);
});
configRepos = configRepos.filter((item, index, self) => {
if (index === 0) return true;
if (item.id === self[index - 1].id) {
logger.debug(`Duplicate repository ${item.id} found in config file.`);
return false;
}
return true;
});
logger.info(`Discovered ${configRepos.length} unique repositories from config.`);
// Merge the repositories into the database
for (const newRepo of configRepos) {
if (newRepo.id in db.data.repos) {
const existingRepo = db.data.repos[newRepo.id];
const isReindexingRequired = _isAllRepoReindexingRequired || isRepoReindexingRequired(existingRepo, newRepo);
if (isReindexingRequired) {
logger.info(`Marking ${newRepo.id} for reindexing due to configuration change.`);
}
await updateRepository(existingRepo.id, {
...newRepo,
...(isReindexingRequired ? {
lastIndexedDate: undefined,
}: {})
}, db);
} else {
await createRepository(newRepo, db);
captureEvent("repo_created", {
vcs: newRepo.vcs,
codeHost: newRepo.codeHost,
});
}
}
// Find repositories that are in the database, but not in the configuration file
{
const a = configRepos.map(repo => repo.id);
const b = Object.keys(db.data.repos);
const diff = b.filter(x => !a.includes(x));
for (const id of diff) {
await db.update(({ repos }) => {
const repo = repos[id];
if (repo.isStale) {
return;
}
logger.warn(`Repository ${id} is no longer listed in the configuration file or was not found. Marking as stale.`);
repo.isStale = true;
});
}
}
}
export const main = async (context: AppContext) => {
const db = await loadDB(context);
export const main = async (db: PrismaClient, context: AppContext) => {
let abortController = new AbortController();
let isSyncing = false;
const _syncConfig = async () => {
@@ -340,13 +79,6 @@ export const main = async (context: AppContext) => {
console.log(err);
}
}
const localRepos = Object.values(db.data.repos).filter(repo => repo.vcs === 'local');
initLocalRepoFileWatchers(localRepos, async (repo, signal) => {
logger.info(`Change detected to local repository ${repo.id}. Re-syncing...`);
await syncLocalRepository(repo, db.data.settings, context, signal);
await db.update(({ repos }) => repos[repo.id].lastIndexedDate = new Date().toUTCString());
});
}
// Re-sync on file changes if the config file is local
@@ -360,27 +92,18 @@ export const main = async (context: AppContext) => {
// Re-sync at a fixed interval
setInterval(() => {
_syncConfig();
}, db.data.settings.resyncInterval);
}, DEFAULT_SETTINGS.resyncIntervalMs);
// Sync immediately on startup
await _syncConfig();
while (true) {
const repos = db.data.repos;
const repos = await db.repo.findMany();
for (const [_, repo] of Object.entries(repos)) {
const lastIndexed = repo.lastIndexedDate ? new Date(repo.lastIndexedDate) : new Date(0);
for (const repo of repos) {
const lastIndexed = repo.indexedAt ?? new Date(0);
if (repo.isStale) {
if (db.data.settings.autoDeleteStaleRepos) {
await deleteStaleRepository(repo, db, context);
} else {
// skip deletion...
}
continue;
}
if (lastIndexed.getTime() > (Date.now() - db.data.settings.reindexInterval)) {
if (lastIndexed.getTime() > (Date.now() - DEFAULT_SETTINGS.reindexIntervalMs)) {
continue;
}
@@ -389,19 +112,14 @@ export const main = async (context: AppContext) => {
let fetchDuration_s: number | undefined;
let cloneDuration_s: number | undefined;
if (repo.vcs === 'git') {
const stats = await syncGitRepository(repo, db.data.settings, context);
indexDuration_s = stats.indexDuration_s;
fetchDuration_s = stats.fetchDuration_s;
cloneDuration_s = stats.cloneDuration_s;
} else if (repo.vcs === 'local') {
const stats = await syncLocalRepository(repo, db.data.settings, context);
indexDuration_s = stats.indexDuration_s;
}
const stats = await syncGitRepository(repo, context);
indexDuration_s = stats.indexDuration_s;
fetchDuration_s = stats.fetchDuration_s;
cloneDuration_s = stats.cloneDuration_s;
captureEvent('repo_synced', {
vcs: repo.vcs,
codeHost: repo.codeHost,
vcs: 'git',
codeHost: repo.external_codeHostType,
indexDuration_s,
fetchDuration_s,
cloneDuration_s,
@@ -412,7 +130,14 @@ export const main = async (context: AppContext) => {
continue;
}
await db.update(({ repos }) => repos[repo.id].lastIndexedDate = new Date().toUTCString());
await db.repo.update({
where: {
id: repo.id,
},
data: {
indexedAt: new Date(),
}
});
}
await new Promise(resolve => setTimeout(resolve, 1000));


@@ -71,7 +71,7 @@ export interface GitHubConfig {
*
* @minItems 1
*/
topics?: [string, ...string[]];
topics?: string[];
exclude?: {
/**
* Exclude forked repositories from syncing.
@@ -159,7 +159,7 @@ export interface GitLabConfig {
*
* @minItems 1
*/
topics?: [string, ...string[]];
topics?: string[];
exclude?: {
/**
* Exclude forked projects from syncing.


@@ -1,3 +1,6 @@
/**
* @deprecated in V3
*/
interface BaseRepository {
vcs: 'git' | 'local';
id: string;
@@ -12,6 +15,9 @@ interface BaseRepository {
sizeInBytes?: number;
}
/**
* @deprecated in V3
*/
export interface GitRepository extends BaseRepository {
vcs: 'git';
cloneUrl: string;
@@ -20,12 +26,18 @@ export interface GitRepository extends BaseRepository {
gitConfigMetadata?: Record<string, string>;
}
/**
* @deprecated in V3
*/
export interface LocalRepository extends BaseRepository {
vcs: 'local';
excludedPaths: string[];
watch: boolean;
}
/**
* @deprecated in V3
*/
export type Repository = GitRepository | LocalRepository;
export type AppContext = {
@@ -56,11 +68,11 @@ export type Settings = {
/**
* The interval (in milliseconds) at which the indexer should re-index all repositories.
*/
reindexInterval: number;
reindexIntervalMs: number;
/**
* The interval (in milliseconds) at which the configuration file should be re-synced.
*/
resyncInterval: number;
resyncIntervalMs: number;
}
// @see : https://stackoverflow.com/a/61132308

View file

@@ -2,6 +2,7 @@ import { Logger } from "winston";
import { AppContext, Repository } from "./types.js";
import path from 'path';
import micromatch from "micromatch";
import { Repo } from "@sourcebot/db";
export const measure = async <T>(cb : () => Promise<T>) => {
const start = Date.now();
@@ -129,3 +130,7 @@ export const arraysEqualShallow = <T>(a?: readonly T[], b?: readonly T[]) => {
return true;
}
export const getRepoPath = (repo: Repo, ctx: AppContext) => {
return path.join(ctx.reposPath, repo.id.toString());
}


@@ -1,16 +1,18 @@
import { exec } from "child_process";
import { AppContext, GitRepository, LocalRepository, Settings } from "./types.js";
import { AppContext, LocalRepository, Settings } from "./types.js";
import { Repo } from "@sourcebot/db";
import { getRepoPath } from "./utils.js";
import { DEFAULT_SETTINGS } from "./constants.js";
const ALWAYS_EXCLUDED_DIRS = ['.git', '.hg', '.svn'];
export const indexGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
export const indexGitRepository = async (repo: Repo, ctx: AppContext) => {
const revisions = [
'HEAD',
...repo.branches ?? [],
...repo.tags ?? [],
'HEAD'
];
const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -file_limit ${settings.maxFileSize} -branches ${revisions.join(',')} ${repo.path}`;
const repoPath = getRepoPath(repo, ctx);
const command = `zoekt-git-index -allow_missing_branches -index ${ctx.indexPath} -file_limit ${DEFAULT_SETTINGS.maxFileSize} -branches ${revisions.join(',')} -shard_prefix ${repo.id} ${repoPath}`;
return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => {
exec(command, (error, stdout, stderr) => {


@@ -12,6 +12,7 @@ const BANNER_COMMENT = '// THIS IS A AUTO-GENERATED FILE. DO NOT MODIFY MANUALLY
const content = await compileFromFile(schemaPath, {
bannerComment: BANNER_COMMENT,
cwd,
ignoreMinAndMaxItems: true,
});
await fs.promises.writeFile(

packages/db/.env (new file)

@@ -0,0 +1 @@
DATABASE_URL=file:../../../.sourcebot/db.sqlite

packages/db/.gitignore (new file)

@@ -0,0 +1,3 @@
node_modules
!.env

packages/db/README.md (new file)

@@ -0,0 +1 @@
This package contains the database schema (prisma/schema.prisma), the migrations (prisma/migrations), and the client library for interacting with the database. Before making edits to the schema, please read about Prisma's [migration model](https://www.prisma.io/docs/orm/prisma-migrate/understanding-prisma-migrate/mental-model) to get an idea of how migrations work.
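
As a minimal, illustrative sketch (not part of this commit), consuming the generated client might look like this, using the `Repo` fields defined in `prisma/schema.prisma` below:

import { PrismaClient } from '@sourcebot/db';

const prisma = new PrismaClient();

// Find repositories that have never been indexed.
const pending = await prisma.repo.findMany({
    where: { indexedAt: null },
});
console.log(pending.map((repo) => repo.name));

await prisma.$disconnect();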

packages/db/package.json (new file)

@@ -0,0 +1,24 @@
{
"name": "@sourcebot/db",
"version": "0.1.0",
"main": "dist/index.js",
"private": true,
"scripts": {
"prisma:generate": "prisma generate",
"prisma:generate:watch": "prisma generate --watch",
"prisma:migrate:dev": "prisma migrate dev",
"prisma:migrate:prod": "prisma migrate deploy",
"prisma:migrate:reset": "prisma migrate reset",
"prisma:db:push": "prisma db push",
"prisma:studio": "prisma studio",
"build": "yarn prisma:generate && tsc",
"postinstall": "yarn build"
},
"devDependencies": {
"prisma": "^6.2.1",
"typescript": "^5.7.3"
},
"dependencies": {
"@prisma/client": "6.2.1"
}
}

View file

@@ -0,0 +1,18 @@
-- CreateTable
CREATE TABLE "Repo" (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"name" TEXT NOT NULL,
"createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updatedAt" DATETIME NOT NULL,
"indexedAt" DATETIME,
"isFork" BOOLEAN NOT NULL,
"isArchived" BOOLEAN NOT NULL,
"metadata" JSONB NOT NULL,
"cloneUrl" TEXT NOT NULL,
"external_id" TEXT NOT NULL,
"external_codeHostType" TEXT NOT NULL,
"external_codeHostUrl" TEXT NOT NULL
);
-- CreateIndex
CREATE UNIQUE INDEX "Repo_external_id_external_codeHostUrl_key" ON "Repo"("external_id", "external_codeHostUrl");

3
packages/db/prisma/migrations/migration_lock.toml Normal file
View file

@@ -0,0 +1,3 @@
# Please do not edit this file manually
# It should be added in your version-control system (e.g., Git)
provider = "sqlite"

32
packages/db/prisma/schema.prisma Normal file
View file

@@ -0,0 +1,32 @@
// This is your Prisma schema file,
// learn more about it in the docs: https://pris.ly/d/prisma-schema
generator client {
provider = "prisma-client-js"
}
datasource db {
provider = "sqlite"
url = env("DATABASE_URL")
}
model Repo {
id Int @id @default(autoincrement())
name String
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
indexedAt DateTime?
isFork Boolean
isArchived Boolean
metadata Json
cloneUrl String
// The id of the repo in the external service
external_id String
// The type of the external service (e.g., github, gitlab, etc.)
external_codeHostType String
// The base url of the external service (e.g., https://github.com)
external_codeHostUrl String
@@unique([external_id, external_codeHostUrl])
}
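The compound unique on (external_id, external_codeHostUrl) gives the config sync a natural upsert key. A minimal client sketch (field values are illustrative; Prisma names the compound-unique input external_id_external_codeHostUrl by default):

    import { PrismaClient } from '@sourcebot/db';

    const db = new PrismaClient();

    // Insert the repo on first sight, update it on subsequent syncs.
    const upsertRepo = async () => {
        await db.repo.upsert({
            where: {
                external_id_external_codeHostUrl: {
                    external_id: '12345',
                    external_codeHostUrl: 'https://github.com',
                },
            },
            create: {
                name: 'example/repo',
                cloneUrl: 'https://github.com/example/repo.git',
                isFork: false,
                isArchived: false,
                metadata: {},
                external_id: '12345',
                external_codeHostType: 'github',
                external_codeHostUrl: 'https://github.com',
            },
            update: { isArchived: false },
        });
    };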

1
packages/db/src/index.ts Normal file
View file

@@ -0,0 +1 @@
export * from ".prisma/client";
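Re-exporting the generated client means consumers pull both the runtime client and the generated model types from one workspace import. A minimal sketch:

    import { PrismaClient, Repo } from '@sourcebot/db';

    const db = new PrismaClient();
    // e.g., find repos that have never been indexed.
    const pending: Promise<Repo[]> = db.repo.findMany({ where: { indexedAt: null } });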

25
packages/db/tsconfig.json Normal file
View file

@@ -0,0 +1,25 @@
{
"compilerOptions": {
"outDir": "dist",
"incremental": true,
"declaration": true,
"emitDecoratorMetadata": true,
"esModuleInterop": true,
"experimentalDecorators": true,
"forceConsistentCasingInFileNames": true,
"isolatedModules": false,
"module": "CommonJS",
"moduleResolution": "node",
"noEmitOnError": false,
"noImplicitAny": true,
"noUnusedLocals": true,
"pretty": true,
"resolveJsonModule": true,
"skipLibCheck": true,
"strict": true,
"sourceMap": true,
"target": "ES2017",
},
"include": ["src/index.ts", "src/index.d.ts"],
"exclude": ["node_modules"]
}

View file

@@ -1240,6 +1240,47 @@
resolved "https://registry.yarnpkg.com/@pkgjs/parseargs/-/parseargs-0.11.0.tgz#a77ea742fab25775145434eb1d2328cf5013ac33"
integrity sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==
"@prisma/client@6.2.1":
version "6.2.1"
resolved "https://registry.yarnpkg.com/@prisma/client/-/client-6.2.1.tgz#3d7d0c8669bba490247e1ffff67b93a516bd789f"
integrity sha512-msKY2iRLISN8t5X0Tj7hU0UWet1u0KuxSPHWuf3IRkB4J95mCvGpyQBfQ6ufcmvKNOMQSq90O2iUmJEN2e5fiA==
"@prisma/debug@6.2.1":
version "6.2.1"
resolved "https://registry.yarnpkg.com/@prisma/debug/-/debug-6.2.1.tgz#887719967c4942d125262e48f6c47c45d17c1f61"
integrity sha512-0KItvt39CmQxWkEw6oW+RQMD6RZ43SJWgEUnzxN8VC9ixMysa7MzZCZf22LCK5DSooiLNf8vM3LHZm/I/Ni7bQ==
"@prisma/engines-version@6.2.0-14.4123509d24aa4dede1e864b46351bf2790323b69":
version "6.2.0-14.4123509d24aa4dede1e864b46351bf2790323b69"
resolved "https://registry.yarnpkg.com/@prisma/engines-version/-/engines-version-6.2.0-14.4123509d24aa4dede1e864b46351bf2790323b69.tgz#b84ce3fab44bfa13a22669da02752330b61745b2"
integrity sha512-7tw1qs/9GWSX6qbZs4He09TOTg1ff3gYsB3ubaVNN0Pp1zLm9NC5C5MZShtkz7TyQjx7blhpknB7HwEhlG+PrQ==
"@prisma/engines@6.2.1":
version "6.2.1"
resolved "https://registry.yarnpkg.com/@prisma/engines/-/engines-6.2.1.tgz#14ef56bb780f02871a728667161d997a14aedb69"
integrity sha512-lTBNLJBCxVT9iP5I7Mn6GlwqAxTpS5qMERrhebkUhtXpGVkBNd/jHnNJBZQW4kGDCKaQg/r2vlJYkzOHnAb7ZQ==
dependencies:
"@prisma/debug" "6.2.1"
"@prisma/engines-version" "6.2.0-14.4123509d24aa4dede1e864b46351bf2790323b69"
"@prisma/fetch-engine" "6.2.1"
"@prisma/get-platform" "6.2.1"
"@prisma/fetch-engine@6.2.1":
version "6.2.1"
resolved "https://registry.yarnpkg.com/@prisma/fetch-engine/-/fetch-engine-6.2.1.tgz#cd7eb7428a407105e0f3761dba536aefd41fc7f7"
integrity sha512-OO7O9d6Mrx2F9i+Gu1LW+DGXXyUFkP7OE5aj9iBfA/2jjDXEJjqa9X0ZmM9NZNo8Uo7ql6zKm6yjDcbAcRrw1A==
dependencies:
"@prisma/debug" "6.2.1"
"@prisma/engines-version" "6.2.0-14.4123509d24aa4dede1e864b46351bf2790323b69"
"@prisma/get-platform" "6.2.1"
"@prisma/get-platform@6.2.1":
version "6.2.1"
resolved "https://registry.yarnpkg.com/@prisma/get-platform/-/get-platform-6.2.1.tgz#34313cd0ee3587798ad33a7b57b6342dc8e66426"
integrity sha512-zp53yvroPl5m5/gXYLz7tGCNG33bhG+JYCm74ohxOq1pPnrL47VQYFfF3RbTZ7TzGWCrR3EtoiYMywUBw7UK6Q==
dependencies:
"@prisma/debug" "6.2.1"
"@radix-ui/number@1.1.0":
version "1.1.0"
resolved "https://registry.yarnpkg.com/@radix-ui/number/-/number-1.1.0.tgz#1e95610461a09cdf8bb05c152e76ca1278d5da46"
@@ -3620,7 +3661,7 @@ fs.realpath@^1.0.0:
resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f"
integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==
fsevents@~2.3.2, fsevents@~2.3.3:
fsevents@2.3.3, fsevents@~2.3.2, fsevents@~2.3.3:
version "2.3.3"
resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.3.3.tgz#cac6407785d03675a2a5e1a5305c697b347d90d6"
integrity sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==
@@ -4994,6 +5035,15 @@ pretty-bytes@^6.1.1:
resolved "https://registry.yarnpkg.com/pretty-bytes/-/pretty-bytes-6.1.1.tgz#38cd6bb46f47afbf667c202cfc754bffd2016a3b"
integrity sha512-mQUvGU6aUFQ+rNvTIAcZuWGRT9a6f6Yrg9bHs4ImKF+HZCEK+plBvnAZYSIQztknZF2qnzNtr6F8s0+IuptdlQ==
prisma@^6.2.1:
version "6.2.1"
resolved "https://registry.yarnpkg.com/prisma/-/prisma-6.2.1.tgz#457b210326d66d0e6f583cc6f9cd2819b984408f"
integrity sha512-hhyM0H13pQleQ+br4CkzGizS5I0oInoeTw3JfLw1BRZduBSQxPILlJLwi+46wZzj9Je7ndyQEMGw/n5cN2fknA==
dependencies:
"@prisma/engines" "6.2.1"
optionalDependencies:
fsevents "2.3.3"
process@^0.11.10:
version "0.11.10"
resolved "https://registry.yarnpkg.com/process/-/process-0.11.10.tgz#7332300e840161bda3e69a1d1d91a7d4bc16f182"
@@ -5988,6 +6038,11 @@ typescript@^5, typescript@^5.6.2:
resolved "https://registry.yarnpkg.com/typescript/-/typescript-5.6.2.tgz#d1de67b6bef77c41823f822df8f0b3bcff60a5a0"
integrity sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==
typescript@^5.7.3:
version "5.7.3"
resolved "https://registry.yarnpkg.com/typescript/-/typescript-5.7.3.tgz#919b44a7dbb8583a9b856d162be24a54bf80073e"
integrity sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==
unbox-primitive@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/unbox-primitive/-/unbox-primitive-1.0.2.tgz#29032021057d5e6cdbd08c5129c226dff8ed6f9e"