Compare commits

..

No commits in common. "main" and "v4.7.3" have entirely different histories.
main ... v4.7.3

508 changed files with 14274 additions and 29785 deletions

View file

@ -4,8 +4,12 @@ DATABASE_URL="postgresql://postgres:postgres@localhost:5432/postgres"
# Zoekt
ZOEKT_WEBSERVER_URL="http://localhost:6070"
# SHARD_MAX_MATCH_COUNT=10000
# TOTAL_MAX_MATCH_COUNT=100000
# The command to use for generating ctags.
CTAGS_COMMAND=ctags
# logging, strict
SRC_TENANT_ENFORCEMENT_MODE=strict
# Auth.JS
# You can generate a new secret with:
@ -21,7 +25,7 @@ AUTH_URL="http://localhost:3000"
DATA_CACHE_DIR=${PWD}/.sourcebot # Path to the sourcebot cache dir (ex. ~/sourcebot/.sourcebot)
SOURCEBOT_PUBLIC_KEY_PATH=${PWD}/public.pem
CONFIG_PATH=${PWD}/config.json # Path to the sourcebot config file (if one exists)
# CONFIG_PATH=${PWD}/config.json # Path to the sourcebot config file (if one exists)
# Email
# EMAIL_FROM_ADDRESS="" # The from address for transactional emails.
@ -29,6 +33,7 @@ CONFIG_PATH=${PWD}/config.json # Path to the sourcebot config file (if one exist
# PostHog
# POSTHOG_PAPIK=""
# NEXT_PUBLIC_POSTHOG_PAPIK=""
# Sentry
# SENTRY_BACKEND_DSN=""
@ -77,11 +82,14 @@ SOURCEBOT_TELEMETRY_DISABLED=true # Disables telemetry collection
# Controls the number of concurrent indexing jobs that can run at once
# INDEX_CONCURRENCY_MULTIPLE=
# Controls the polling interval for the web app
# NEXT_PUBLIC_POLLING_INTERVAL_MS=
# Controls the version of the web app
# NEXT_PUBLIC_SOURCEBOT_VERSION=
# CONFIG_MAX_REPOS_NO_TOKEN=
NODE_ENV=development
# NODE_ENV=
# SOURCEBOT_TENANCY_MODE=single
# NEXT_PUBLIC_SOURCEBOT_CLOUD_ENVIRONMENT=

View file

@ -1,4 +1,4 @@
contact_links:
- name: 👾 Discord
url: https://discord.gg/HDScTs3ptP
url: https://discord.gg/f4Cbf3HT
about: Something else? Join the Discord!

View file

@ -55,6 +55,7 @@ jobs:
${{ env.IMAGE_PATH }}:latest
build-args: |
NEXT_PUBLIC_SOURCEBOT_VERSION=${{ github.ref_name }}
NEXT_PUBLIC_POSTHOG_PAPIK=${{ vars.NEXT_PUBLIC_POSTHOG_PAPIK }}
NEXT_PUBLIC_SOURCEBOT_CLOUD_ENVIRONMENT=${{ vars.NEXT_PUBLIC_SOURCEBOT_CLOUD_ENVIRONMENT }}
NEXT_PUBLIC_SENTRY_ENVIRONMENT=${{ vars.NEXT_PUBLIC_SENTRY_ENVIRONMENT }}
NEXT_PUBLIC_SENTRY_WEBAPP_DSN=${{ vars.NEXT_PUBLIC_SENTRY_WEBAPP_DSN }}

View file

@ -2,7 +2,7 @@ name: Deploy Demo
on:
push:
tags: ["v*.*.*"]
branches: ["main"]
workflow_dispatch:
jobs:

View file

@ -73,12 +73,13 @@ jobs:
with:
context: .
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha,scope=${{ env.PLATFORM_PAIR }}
cache-to: type=gha,mode=max,scope=${{ env.PLATFORM_PAIR }}
cache-from: type=gha
cache-to: type=gha,mode=max
platforms: ${{ matrix.platform }}
outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true,annotation.org.opencontainers.image.description=Blazingly fast code search
build-args: |
NEXT_PUBLIC_SOURCEBOT_VERSION=${{ github.ref_name }}
NEXT_PUBLIC_POSTHOG_PAPIK=${{ vars.NEXT_PUBLIC_POSTHOG_PAPIK }}
- name: Export digest
run: |

View file

@ -1,9 +1,8 @@
name: Update Roadmap Released
on:
push:
branches:
- main
pull_request:
types: [closed]
workflow_dispatch:
schedule:
- cron: "0 */6 * * *"

View file

@ -5,9 +5,6 @@
},
{
"path": "../vendor/zoekt"
},
{
"path": "../../sourcebot-helm-chart"
}
],
"settings": {

View file

@ -7,145 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Fixed
- Fixed issue where parentheses in query params were not being encoded, resulting in a poor experience when embedding links in Markdown. [#674](https://github.com/sourcebot-dev/sourcebot/pull/674)
## [4.10.3] - 2025-12-12
### Fixed
- Fixed review agent so that it works with GHES instances [#611](https://github.com/sourcebot-dev/sourcebot/pull/611)
- Updated next package version to fix CVE-2025-55184 and CVE-2025-55183. [#673](https://github.com/sourcebot-dev/sourcebot/pull/673)
### Added
- Added support for arbitrary user IDs required for OpenShift. [#658](https://github.com/sourcebot-dev/sourcebot/pull/658)
### Updated
- Improved error messages in file source api. [#665](https://github.com/sourcebot-dev/sourcebot/pull/665)
## [4.10.2] - 2025-12-04
### Fixed
- Fixed issue where the disable telemetry flag was not being respected for web server telemetry. [#657](https://github.com/sourcebot-dev/sourcebot/pull/657)
## [4.10.1] - 2025-12-03
### Added
- Added `ALWAYS_INDEX_FILE_PATTERNS` environment variable to allow specifying a comma-separated list of glob patterns matching file paths that should always be indexed, regardless of size or # of trigrams. [#631](https://github.com/sourcebot-dev/sourcebot/pull/631)
- Added button to explore menu to toggle cross-repository search. [#647](https://github.com/sourcebot-dev/sourcebot/pull/647)
- Added server side telemetry for search metrics. [#652](https://github.com/sourcebot-dev/sourcebot/pull/652)
### Fixed
- Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629)
- Fixed issue where files with special characters would fail to load. [#636](https://github.com/sourcebot-dev/sourcebot/issues/636)
- Fixed Ask performance issues. [#632](https://github.com/sourcebot-dev/sourcebot/pull/632)
- Fixed regression where creating a new Ask thread when unauthenticated would result in a 404. [#641](https://github.com/sourcebot-dev/sourcebot/pull/641)
- Updated react and next package versions to fix CVE-2025-55182. [#654](https://github.com/sourcebot-dev/sourcebot/pull/654)
### Changed
- Changed the default behaviour for code nav to scope references & definitions search to the current repository. [#647](https://github.com/sourcebot-dev/sourcebot/pull/647)
## [4.10.0] - 2025-11-24
### Added
- Added support for streaming code search results. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Added buttons to toggle case sensitivity and regex patterns. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Added counts to members, requests, and invites tabs in the members settings. [#621](https://github.com/sourcebot-dev/sourcebot/pull/621)
- [Sourcebot EE] Add support for Authentik as an identity provider. [#627](https://github.com/sourcebot-dev/sourcebot/pull/627)
### Changed
- Changed the default search behaviour to match patterns as substrings and **not** regular expressions. Regular expressions can be used by toggling the regex button in search bar. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Renamed `public` query prefix to `visibility`. Allowed values for `visibility` are `public`, `private`, and `any`. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Changed `archived` query prefix to accept values `yes`, `no`, and `only`. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Changed `lang` query prefix to be case sensitive. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
### Removed
- Removed `case` query prefix. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Removed `branch` and `b` query prefixes. Please use `rev:` instead. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
- Removed `regex` query prefix. [#623](https://github.com/sourcebot-dev/sourcebot/pull/623)
### Fixed
- Fixed spurious infinite loads with explore panel, file tree, and file search command. [#617](https://github.com/sourcebot-dev/sourcebot/pull/617)
- Wipe search context on init if entitlement no longer exists [#618](https://github.com/sourcebot-dev/sourcebot/pull/618)
- Fixed Bitbucket repository exclusions not supporting glob patterns. [#620](https://github.com/sourcebot-dev/sourcebot/pull/620)
- Fixed issue where the repo driven permission syncer was attempting to sync public repositories. [#624](https://github.com/sourcebot-dev/sourcebot/pull/624)
- Fixed issue where worker would not shut down while a permission sync job (repo or user) was in progress. [#624](https://github.com/sourcebot-dev/sourcebot/pull/624)
## [4.9.2] - 2025-11-13
### Changed
- Bumped the default requested search result count from 5k to 10k after optimization pass. [#615](https://github.com/sourcebot-dev/sourcebot/pull/615)
### Fixed
- Fixed incorrect shutdown of PostHog SDK in the worker. [#609](https://github.com/sourcebot-dev/sourcebot/pull/609)
- Fixed race condition in job schedulers. [#607](https://github.com/sourcebot-dev/sourcebot/pull/607)
- Fixed connection sync jobs getting stuck in pending or in progress after restarting the worker. [#612](https://github.com/sourcebot-dev/sourcebot/pull/612)
- Fixed issue where connections would always sync on startup, regardless if they changed or not. [#613](https://github.com/sourcebot-dev/sourcebot/pull/613)
- Fixed performance bottleneck in search api. Result is an order of magnitude improvement to average search time according to benchmarks. [#615](https://github.com/sourcebot-dev/sourcebot/pull/615)
### Added
- Added force resync buttons for connections and repositories. [#610](https://github.com/sourcebot-dev/sourcebot/pull/610)
- Added environment variable to configure default search result count. [#616](https://github.com/sourcebot-dev/sourcebot/pull/616)
## [4.9.1] - 2025-11-07
### Added
- Added support for running Sourcebot as non-root user. [#599](https://github.com/sourcebot-dev/sourcebot/pull/599)
## [4.9.0] - 2025-11-04
### Added
- [Experimental][Sourcebot EE] Added GitLab permission syncing. [#585](https://github.com/sourcebot-dev/sourcebot/pull/585)
- [Sourcebot EE] Added external identity provider config and support for multiple accounts. [#595](https://github.com/sourcebot-dev/sourcebot/pull/595)
- Added ability to configure environment variables from the config. [#597](https://github.com/sourcebot-dev/sourcebot/pull/597)
### Fixed
- [ask sb] Fixed issue where reasoning tokens would appear in `text` content for openai compatible models. [#582](https://github.com/sourcebot-dev/sourcebot/pull/582)
- Fixed issue with GitHub app token tracking and refreshing. [#583](https://github.com/sourcebot-dev/sourcebot/pull/583)
- Fixed "The account is already associated with another user" errors with GitLab oauth provider. [#584](https://github.com/sourcebot-dev/sourcebot/pull/584)
- Fixed error when viewing a generic git connection in `/settings/connections`. [#588](https://github.com/sourcebot-dev/sourcebot/pull/588)
- Fixed issue with an unbounded `Promise.allSettled(...)` when retrieving details from the GitHub API about a large number of repositories (or orgs or users). [#591](https://github.com/sourcebot-dev/sourcebot/pull/591)
- Fixed resource exhaustion (EAGAIN errors) when syncing generic-git-host connections with thousands of repositories. [#593](https://github.com/sourcebot-dev/sourcebot/pull/593)
### Removed
- Removed built-in secret manager. [#592](https://github.com/sourcebot-dev/sourcebot/pull/592)
### Changed
- Changed internal representation of how repo permissions are represented in the database. [#600](https://github.com/sourcebot-dev/sourcebot/pull/600)
## [4.8.1] - 2025-10-29
### Fixed
- Fixed commit and branch hyperlinks not rendering for Gerrit repos. [#581](https://github.com/sourcebot-dev/sourcebot/pull/581)
- Fixed visual bug when a repository does not have an image. [#581](https://github.com/sourcebot-dev/sourcebot/pull/581)
- Fixed issue where the Ask homepage was not scrollable. [#581](https://github.com/sourcebot-dev/sourcebot/pull/581)
## [4.8.0] - 2025-10-28
### Added
- Implement dynamic tab titles for files and folders in browse tab. [#560](https://github.com/sourcebot-dev/sourcebot/pull/560)
- Added support for passing db connection url as separate `DATABASE_HOST`, `DATABASE_USERNAME`, `DATABASE_PASSWORD`, `DATABASE_NAME`, and `DATABASE_ARGS` env vars. [#545](https://github.com/sourcebot-dev/sourcebot/pull/545)
- Added support for GitHub Apps for service auth. [#570](https://github.com/sourcebot-dev/sourcebot/pull/570)
- Added prometheus metrics for repo index manager. [#571](https://github.com/sourcebot-dev/sourcebot/pull/571)
- Added experimental environment variable to disable API key creation for non-admin users. [#577](https://github.com/sourcebot-dev/sourcebot/pull/577)
- [Experimental][Sourcebot EE] Added REST API to get users and delete a user. [#578](https://github.com/sourcebot-dev/sourcebot/pull/578)
### Fixed
- Fixed "dubious ownership" errors when cloning / fetching repos. [#553](https://github.com/sourcebot-dev/sourcebot/pull/553)
- Fixed issue with Ask Sourcebot tutorial re-appearing after restarting the browser. [#563](https://github.com/sourcebot-dev/sourcebot/pull/563)
- Fixed `repoIndexTimeoutMs` not being used for index job timeouts. [#567](https://github.com/sourcebot-dev/sourcebot/pull/567)
### Changed
- Improved search performance for unbounded search queries. [#555](https://github.com/sourcebot-dev/sourcebot/pull/555)
- Improved homepage performance by removing client side polling. [#563](https://github.com/sourcebot-dev/sourcebot/pull/563)
- Changed navbar indexing indicator to only report progress for first time indexing jobs. [#563](https://github.com/sourcebot-dev/sourcebot/pull/563)
- Improved repo indexing job stability and robustness. [#563](https://github.com/sourcebot-dev/sourcebot/pull/563)
- Improved repositories table. [#572](https://github.com/sourcebot-dev/sourcebot/pull/572)
- Improved connections table. [#579](https://github.com/sourcebot-dev/sourcebot/pull/579)
### Removed
- Removed spam "login page loaded" log. [#552](https://github.com/sourcebot-dev/sourcebot/pull/552)
- Removed connections management page. [#563](https://github.com/sourcebot-dev/sourcebot/pull/563)
## [4.7.3] - 2025-09-29
### Fixed

View file

@ -36,20 +36,15 @@
docker compose -f docker-compose-dev.yml up -d
```
6. Generate the database schema.
```sh
yarn dev:prisma:migrate:dev
```
6. Create a copy of `.env.development` and name it `.env.development.local`. Update the required environment variables.
7. Create a copy of `.env.development` and name it `.env.development.local`. Update the required environment variables.
7. If you're using a declarative configuration file, create a configuration file and update the `CONFIG_PATH` environment variable in your `.env.development.local` file.
8. If you're using a declarative configuration file, create a configuration file and update the `CONFIG_PATH` environment variable in your `.env.development.local` file.
9. Start Sourcebot with the command:
8. Start Sourcebot with the command:
```sh
yarn dev
```
A `.sourcebot` directory will be created and zoekt will begin to index the repositories found in the `config.json` file.
10. Start searching at `http://localhost:3000`.
9. Start searching at `http://localhost:3000`.

View file

@ -1,4 +1,3 @@
# syntax=docker/dockerfile:1
# ------ Global scope variables ------
# Set of global build arguments.
@ -9,6 +8,11 @@
# @see: https://docs.docker.com/build/building/variables/#scoping
ARG NEXT_PUBLIC_SOURCEBOT_VERSION
# PAPIK = Project API Key
# Note that this key does not need to be kept secret, so it's not
# necessary to use Docker build secrets here.
# @see: https://posthog.com/tutorials/api-capture-events#authenticating-with-the-project-api-key
ARG NEXT_PUBLIC_POSTHOG_PAPIK
ARG NEXT_PUBLIC_SENTRY_ENVIRONMENT
ARG NEXT_PUBLIC_SOURCEBOT_CLOUD_ENVIRONMENT
ARG NEXT_PUBLIC_SENTRY_WEBAPP_DSN
@ -38,13 +42,17 @@ COPY package.json yarn.lock* .yarnrc.yml ./
COPY .yarn ./.yarn
COPY ./packages/db ./packages/db
COPY ./packages/schemas ./packages/schemas
COPY ./packages/crypto ./packages/crypto
COPY ./packages/error ./packages/error
COPY ./packages/logger ./packages/logger
COPY ./packages/shared ./packages/shared
COPY ./packages/queryLanguage ./packages/queryLanguage
RUN yarn workspace @sourcebot/db install
RUN yarn workspace @sourcebot/schemas install
RUN yarn workspace @sourcebot/crypto install
RUN yarn workspace @sourcebot/error install
RUN yarn workspace @sourcebot/logger install
RUN yarn workspace @sourcebot/shared install
RUN yarn workspace @sourcebot/query-language install
# ------------------------------------
# ------ Build Web ------
@ -53,6 +61,8 @@ ENV SKIP_ENV_VALIDATION=1
# -----------
ARG NEXT_PUBLIC_SOURCEBOT_VERSION
ENV NEXT_PUBLIC_SOURCEBOT_VERSION=$NEXT_PUBLIC_SOURCEBOT_VERSION
ARG NEXT_PUBLIC_POSTHOG_PAPIK
ENV NEXT_PUBLIC_POSTHOG_PAPIK=$NEXT_PUBLIC_POSTHOG_PAPIK
ARG NEXT_PUBLIC_SENTRY_ENVIRONMENT
ENV NEXT_PUBLIC_SENTRY_ENVIRONMENT=$NEXT_PUBLIC_SENTRY_ENVIRONMENT
ARG NEXT_PUBLIC_SOURCEBOT_CLOUD_ENVIRONMENT
@ -87,8 +97,10 @@ COPY ./packages/web ./packages/web
COPY --from=shared-libs-builder /app/node_modules ./node_modules
COPY --from=shared-libs-builder /app/packages/db ./packages/db
COPY --from=shared-libs-builder /app/packages/schemas ./packages/schemas
COPY --from=shared-libs-builder /app/packages/crypto ./packages/crypto
COPY --from=shared-libs-builder /app/packages/error ./packages/error
COPY --from=shared-libs-builder /app/packages/logger ./packages/logger
COPY --from=shared-libs-builder /app/packages/shared ./packages/shared
COPY --from=shared-libs-builder /app/packages/queryLanguage ./packages/queryLanguage
# Fixes arm64 timeouts
RUN yarn workspace @sourcebot/web install
@ -126,8 +138,10 @@ COPY ./packages/backend ./packages/backend
COPY --from=shared-libs-builder /app/node_modules ./node_modules
COPY --from=shared-libs-builder /app/packages/db ./packages/db
COPY --from=shared-libs-builder /app/packages/schemas ./packages/schemas
COPY --from=shared-libs-builder /app/packages/crypto ./packages/crypto
COPY --from=shared-libs-builder /app/packages/error ./packages/error
COPY --from=shared-libs-builder /app/packages/logger ./packages/logger
COPY --from=shared-libs-builder /app/packages/shared ./packages/shared
COPY --from=shared-libs-builder /app/packages/queryLanguage ./packages/queryLanguage
RUN yarn workspace @sourcebot/backend install
RUN yarn workspace @sourcebot/backend build
@ -142,12 +156,14 @@ fi
ENV SKIP_ENV_VALIDATION=0
# ------------------------------
# ------ Runner ------
FROM node-alpine AS runner
# -----------
ARG NEXT_PUBLIC_SOURCEBOT_VERSION
ENV NEXT_PUBLIC_SOURCEBOT_VERSION=$NEXT_PUBLIC_SOURCEBOT_VERSION
ARG NEXT_PUBLIC_POSTHOG_PAPIK
ENV NEXT_PUBLIC_POSTHOG_PAPIK=$NEXT_PUBLIC_POSTHOG_PAPIK
ARG NEXT_PUBLIC_SENTRY_ENVIRONMENT
ENV NEXT_PUBLIC_SENTRY_ENVIRONMENT=$NEXT_PUBLIC_SENTRY_ENVIRONMENT
ARG NEXT_PUBLIC_SENTRY_WEBAPP_DSN
@ -169,13 +185,10 @@ ENV DATA_DIR=/data
ENV DATA_CACHE_DIR=$DATA_DIR/.sourcebot
ENV DATABASE_DATA_DIR=$DATA_CACHE_DIR/db
ENV REDIS_DATA_DIR=$DATA_CACHE_DIR/redis
ENV DATABASE_URL="postgresql://postgres@localhost:5432/sourcebot"
ENV REDIS_URL="redis://localhost:6379"
ENV SRC_TENANT_ENFORCEMENT_MODE=strict
ENV SOURCEBOT_PUBLIC_KEY_PATH=/app/public.pem
# PAPIK = Project API Key
# Note that this key does not need to be kept secret, so it's not
# necessary to use Docker build secrets here.
# @see: https://posthog.com/tutorials/api-capture-events#authenticating-with-the-project-api-key
# @note: this is also declared in the shared env.server.ts file.
ENV POSTHOG_PAPIK=phc_lLPuFFi5LH6c94eFJcqvYVFwiJffVcV6HD8U4a1OnRW
# Valid values are: debug, info, warn, error
ENV SOURCEBOT_LOG_LEVEL=info
@ -183,23 +196,6 @@ ENV SOURCEBOT_LOG_LEVEL=info
# Sourcebot collects anonymous usage data using [PostHog](https://posthog.com/). Uncomment this line to disable.
# ENV SOURCEBOT_TELEMETRY_DISABLED=1
# Configure dependencies
RUN apk add --no-cache git ca-certificates bind-tools tini jansson wget supervisor uuidgen curl perl jq redis postgresql postgresql-contrib openssl util-linux unzip
ARG UID=1500
ARG GID=1500
# Always create the non-root user to support runtime user switching
# The container can be run as root (default) or as sourcebot user using docker run --user
RUN addgroup -g $GID sourcebot && \
adduser -D -u $UID -h /app -S sourcebot && \
adduser sourcebot postgres && \
adduser sourcebot redis && \
chown -R sourcebot /app && \
adduser sourcebot node && \
mkdir /var/log/sourcebot && \
chown sourcebot /var/log/sourcebot
COPY package.json yarn.lock* .yarnrc.yml public.pem ./
COPY .yarn ./.yarn
@ -219,48 +215,37 @@ COPY --from=zoekt-builder \
/cmd/zoekt-index \
/usr/local/bin/
RUN chown -R sourcebot:sourcebot /app
# Copy zoekt proto files (needed for gRPC client at runtime)
COPY --chown=sourcebot:sourcebot vendor/zoekt/grpc/protos /app/vendor/zoekt/grpc/protos
# Copy all of the things
COPY --chown=sourcebot:sourcebot --from=web-builder /app/packages/web/public ./packages/web/public
COPY --chown=sourcebot:sourcebot --from=web-builder /app/packages/web/.next/standalone ./
COPY --chown=sourcebot:sourcebot --from=web-builder /app/packages/web/.next/static ./packages/web/.next/static
COPY --from=web-builder /app/packages/web/public ./packages/web/public
COPY --from=web-builder /app/packages/web/.next/standalone ./
COPY --from=web-builder /app/packages/web/.next/static ./packages/web/.next/static
COPY --chown=sourcebot:sourcebot --from=backend-builder /app/node_modules ./node_modules
COPY --chown=sourcebot:sourcebot --from=backend-builder /app/packages/backend ./packages/backend
COPY --from=backend-builder /app/node_modules ./node_modules
COPY --from=backend-builder /app/packages/backend ./packages/backend
COPY --chown=sourcebot:sourcebot --from=shared-libs-builder /app/packages/db ./packages/db
COPY --chown=sourcebot:sourcebot --from=shared-libs-builder /app/packages/schemas ./packages/schemas
COPY --chown=sourcebot:sourcebot --from=shared-libs-builder /app/packages/shared ./packages/shared
COPY --chown=sourcebot:sourcebot --from=shared-libs-builder /app/packages/queryLanguage ./packages/queryLanguage
COPY --from=shared-libs-builder /app/node_modules ./node_modules
COPY --from=shared-libs-builder /app/packages/db ./packages/db
COPY --from=shared-libs-builder /app/packages/schemas ./packages/schemas
COPY --from=shared-libs-builder /app/packages/crypto ./packages/crypto
COPY --from=shared-libs-builder /app/packages/error ./packages/error
COPY --from=shared-libs-builder /app/packages/logger ./packages/logger
COPY --from=shared-libs-builder /app/packages/shared ./packages/shared
# Fixes git "dubious ownership" issues when the volume is mounted with different permissions to the container.
RUN git config --global safe.directory "*"
# Configure dependencies
RUN apk add --no-cache git ca-certificates bind-tools tini jansson wget supervisor uuidgen curl perl jq redis postgresql postgresql-contrib openssl util-linux unzip
# Configure the database
RUN mkdir -p /run/postgresql && \
chown -R postgres:postgres /run/postgresql && \
chmod 775 /run/postgresql
# Make app directory accessible to both root and sourcebot user
RUN chown -R sourcebot /app \
&& chgrp -R 0 /app \
&& chmod -R g=u /app
# Make data directory accessible to both root and sourcebot user
RUN chown -R sourcebot /data
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY prefix-output.sh ./prefix-output.sh
RUN chmod +x ./prefix-output.sh
COPY entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh
# Note: for back-compat cases, we do _not_ set the USER directive here.
# Instead, the user can be overridden at runtime with --user flag.
# USER sourcebot
COPY default-config.json .
EXPOSE 3000
ENV PORT=3000

View file

@ -2,7 +2,7 @@ Copyright (c) 2025 Taqla Inc.
Portions of this software are licensed as follows:
- All content located within any folder or subfolder named “ee” in this repository is licensed under the terms specified in “ee/LICENSE”,
- All content that resides under the "ee/", "packages/web/src/ee/", "packages/backend/src/ee/", and "packages/shared/src/ee/" directories of this repository, if these directories exist, is licensed under the license defined in "ee/LICENSE".
- All third party components incorporated into the Sourcebot Software are licensed under the original license provided by the owner of the applicable component.
- Content outside of the above mentioned directories or restrictions above is available under the "Functional Source License" as defined below.

View file

@ -28,6 +28,10 @@ clean:
packages/db/dist \
packages/schemas/node_modules \
packages/schemas/dist \
packages/crypto/node_modules \
packages/crypto/dist \
packages/error/node_modules \
packages/error/dist \
packages/mcp/node_modules \
packages/mcp/dist \
packages/shared/node_modules \

View file

@ -72,22 +72,15 @@ https://github.com/user-attachments/assets/31ec0669-707d-4e03-b511-1bc33d44197a
# Deploy Sourcebot
Sourcebot can be deployed in seconds using Docker Compose. Visit our [docs](https://docs.sourcebot.dev/docs/deployment/docker-compose) for more information.
Sourcebot can be deployed in seconds using our official docker image. Visit our [docs](https://docs.sourcebot.dev/docs/deployment-guide) for more information.
1. Download the docker-compose.yml file
```sh
curl -o docker-compose.yml https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/docker-compose.yml
```
2. In the same directory as the `docker-compose.yml` file, create a [configuration file](https://docs.sourcebot.dev/docs/configuration/config-file). The configuration file is a JSON file that configures Sourcebot's behaviour, including what repositories to index, language model providers, auth providers, and more.
1. Create a config
```sh
touch config.json
echo '{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
// Comments are supported.
// This config creates a single connection to GitHub.com that
// indexes the Sourcebot repository
"connections": {
// Comments are supported
"starter-connection": {
"type": "github",
"repos": [
@ -98,22 +91,41 @@ echo '{
}' > config.json
```
3. Update the secrets in the `docker-compose.yml` and then run Sourcebot using:
2. Run the docker container
```sh
docker compose up
docker run \
-p 3000:3000 \
--pull=always \
--rm \
-v $(pwd):/data \
-e CONFIG_PATH=/data/config.json \
--name sourcebot \
ghcr.io/sourcebot-dev/sourcebot:latest
```
<details>
<summary>What does this command do?</summary>
4. Visit `http://localhost:3000` to start using Sourcebot
- Pull and run the Sourcebot docker image from [ghcr.io/sourcebot-dev/sourcebot:latest](https://github.com/sourcebot-dev/sourcebot/pkgs/container/sourcebot).
- Mount the current directory (`-v $(pwd):/data`) to allow Sourcebot to persist the `.sourcebot` cache.
- Clones sourcebot at `HEAD` into `.sourcebot/github/sourcebot-dev/sourcebot`.
- Indexes sourcebot into a .zoekt index file in `.sourcebot/index/`.
- Map port 3000 between your machine and the docker image.
- Starts the web server on port 3000.
</details>
</br>
3. Visit `http://localhost:3000` to start using Sourcebot
</br>
To configure Sourcebot (index your own repos, connect your LLMs, etc), check out our [docs](https://docs.sourcebot.dev/docs/configuration/config-file).
> [!NOTE]
> Sourcebot collects <a href="https://demo.sourcebot.dev/~/search?query=captureEvent%5C(%20repo%3Asourcebot">anonymous usage data</a> by default to help us improve the product. No sensitive data is collected, but if you'd like to disable this you can do so by setting the `SOURCEBOT_TELEMETRY_DISABLED` environment
> variable to `true`. Please refer to our [telemetry docs](https://docs.sourcebot.dev/docs/overview#telemetry) for more information.
> variable to `true`. Please refer to our [telemetry docs](https://docs.sourcebot.dev/self-hosting/overview#telemetry) for more information.
# Build from source
>[!NOTE]
> Building from source is only required if you'd like to contribute. If you'd just like to use Sourcebot, we recommend checking out our self-hosting [docs](https://docs.sourcebot.dev/self-hosting/overview).
If you'd like to build from source, please check out the `CONTRIBUTING.md` file for more information.

11
default-config.json Normal file
View file

@ -0,0 +1,11 @@
{
"$schema": "./schemas/v2/index.json",
"repos": [
{
"type": "github",
"repos": [
"sourcebot-dev/sourcebot"
]
}
]
}

243
demo-site-config.json Normal file
View file

@ -0,0 +1,243 @@
// This is the config file for https://demo.sourcebot.dev.
// To add a new repository, edit this file and open a PR.
// After the PR is merged, the deploy demo workflow will
// run (see: https://github.com/sourcebot-dev/sourcebot/actions/workflows/deploy-demo.yml),
// after which the changes will be reflected on the demo site.
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"connections": {
// Defines the GitHub repositories.
// See: https://docs.sourcebot.dev/docs/connections/github
"github-repos": {
"type": "github",
"token": {
"env": "GITHUB_TOKEN"
},
"repos": [
"torvalds/linux",
"pytorch/pytorch",
"commaai/openpilot",
"ggerganov/whisper.cpp",
"ggerganov/llama.cpp",
"codemirror/dev",
"tailwindlabs/tailwindcss",
"sourcebot-dev/sourcebot",
"sindresorhus/awesome",
"facebook/react",
"vinta/awesome-python",
"vuejs/vue",
"TheAlgorithms/Python",
"tensorflow/tensorflow",
"twbs/bootstrap",
"flutter/flutter",
"microsoft/vscode",
"github/gitignore",
"airbnb/javascript",
"AUTOMATIC1111/stable-diffusion-webui",
"huggingface/transformers",
"avelino/awesome-go",
"ytdl-org/youtube-dl",
"vercel/next.js",
"golang/go",
"facebook/react-native",
"electron/electron",
"Genymobile/scrcpy",
"f/awesome-chatgpt-prompts",
"microsoft/PowerToys",
"kubernetes/kubernetes",
"d3/d3",
"nodejs/node",
"massgravel/Microsoft-Activation-Scripts",
"axios/axios",
"mrdoob/three.js",
"krahets/hello-algo",
"facebook/create-react-app",
"ollama/ollama",
"microsoft/TypeScript",
"goldbergyoni/nodebestpractices",
"rust-lang/rust",
"denoland/deno",
"angular/angular",
"langchain-ai/langchain",
"microsoft/terminal",
"521xueweihan/HelloGitHub",
"mui/material-ui",
"ant-design/ant-design",
"yt-dlp/yt-dlp",
"puppeteer/puppeteer",
"papers-we-love/papers-we-love",
"iptv-org/iptv",
"fatedier/frp",
"excalidraw/excalidraw",
"tauri-apps/tauri",
"neovim/neovim",
"django/django",
"florinpop17/app-ideas",
"animate-css/animate.css",
"nvm-sh/nvm",
"gothinkster/realworld",
"bitcoin/bitcoin",
"sveltejs/svelte",
"opencv/opencv",
"gin-gonic/gin",
"laravel/laravel",
"fastapi/fastapi",
"macrozheng/mall",
"jaywcjlove/awesome-mac",
"tonsky/FiraCode",
"rustdesk/rustdesk",
"tensorflow/models",
"doocs/advanced-java",
"shadcn-ui/ui",
"gohugoio/hugo",
"spring-projects/spring-boot",
"supabase/supabase",
"oven-sh/bun",
"FortAwesome/Font-Awesome",
"home-assistant/core",
"typicode/json-server",
"mermaid-js/mermaid",
"openai/whisper",
"netdata/netdata",
"vuejs/awesome-vue",
"3b1b/manim",
"2dust/v2rayN",
"nomic-ai/gpt4all",
"elastic/elasticsearch",
"fighting41love/funNLP",
"vitejs/vite",
"coder/code-server",
"moby/moby",
"CompVis/stable-diffusion",
"base-org/node",
"nestjs/nest",
"pallets/flask",
"hakimel/reveal.js",
"microsoft/playwright",
"swiftlang/swift",
"redis/redis",
"bregman-arie/devops-exercises",
"binary-husky/gpt_academic",
"junegunn/fzf",
"syncthing/syncthing",
"hoppscotch/hoppscotch",
"protocolbuffers/protobuf",
"enaqx/awesome-react",
"expressjs/express",
"microsoft/generative-ai-for-beginners",
"grafana/grafana",
"abi/screenshot-to-code",
"chartjs/Chart.js",
"webpack/webpack",
"d2l-ai/d2l-zh",
"strapi/strapi",
"python/cpython",
"leonardomso/33-js-concepts",
"kdn251/interviews",
"ventoy/Ventoy",
"ansible/ansible",
"apache/superset",
"tesseract-ocr/tesseract",
"lydiahallie/javascript-questions",
"FuelLabs/sway",
"keras-team/keras",
"resume/resume.github.com",
"swisskyrepo/PayloadsAllTheThings",
"ocornut/imgui",
"socketio/socket.io",
"awesomedata/awesome-public-datasets",
"louislam/uptime-kuma",
"kelseyhightower/nocode",
"sherlock-project/sherlock",
"reduxjs/redux",
"apache/echarts",
"obsproject/obs-studio",
"openai/openai-cookbook",
"fffaraz/awesome-cpp",
"scikit-learn/scikit-learn",
"TheAlgorithms/Java",
"atom/atom",
"Eugeny/tabby",
"lodash/lodash",
"caddyserver/caddy",
"sindresorhus/awesome-nodejs",
"rust-unofficial/awesome-rust",
"streamich/react-use",
"pocketbase/pocketbase",
"lllyasviel/Fooocus",
"k88hudson/git-flight-rules",
"react-hook-form/react-hook-form",
"koajs/koa",
"SheetJS/sheetjs",
"trpc/trpc",
"LC044/WeChatMsg",
"airbnb/lottie-android",
"huihut/interview",
"jgm/pandoc",
"google/googletest",
"date-fns/date-fns",
"nativefier/nativefier",
"openai/gym",
"files-community/Files",
"sahat/hackathon-starter",
"appsmithorg/appsmith",
"ultralytics/ultralytics",
"slidevjs/slidev",
"xitu/gold-miner",
"sorrycc/awesome-javascript",
"astral-sh/ruff",
"logseq/logseq",
"shadowsocks/shadowsocks",
"ccxt/ccxt",
"netty/netty",
"tw93/Pake",
"fxsjy/jieba",
"atlassian/react-beautiful-dnd",
"ToolJet/ToolJet",
"markedjs/marked",
"typicode/husky",
"laravel/framework",
"TheAlgorithms/JavaScript",
"bilibili/ijkplayer",
"solidjs/solid",
"fastify/fastify",
"huggingface/pytorch-image-models",
"shadowsocks/ShadowsocksX-NG",
"carbon-language/carbon-lang",
"s0md3v/roop",
"ascoders/weekly",
"backstage/backstage",
"servo/servo",
"composer/composer",
"tastejs/todomvc",
"lutzroeder/netron",
"alibaba/canal",
"tinygrad/tinygrad",
"ManimCommunity/manim",
"filebrowser/filebrowser",
"nicolargo/glances",
"iperov/DeepFaceLive",
"StevenBlack/hosts",
"crossoverJie/JCSprout",
"mantinedev/mantine",
"Automattic/mongoose",
"eslint/eslint",
"nextauthjs/next-auth",
"flameshot-org/flameshot",
"envoyproxy/envoy",
"sourcebot-dev/zoekt"
]
},
// Defines the GitLab repositories.
// See: https://docs.sourcebot.dev/docs/connections/gitlab
"gitlab-repos": {
"type": "gitlab",
"projects": [
"gnachman/iterm2"
]
}
},
"settings": {
"reindexIntervalMs": 86400000 // 24 hours
}
}

View file

@ -1,65 +0,0 @@
services:
sourcebot:
image: ghcr.io/sourcebot-dev/sourcebot:latest
user: sourcebot
restart: always
container_name: sourcebot
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
ports:
- "3000:3000"
volumes:
- ./config.json:/data/config.json
- sourcebot_data:/data
environment:
- CONFIG_PATH=/data/config.json
- AUTH_URL=${AUTH_URL:-http://localhost:3000}
- AUTH_SECRET=${AUTH_SECRET:-000000000000000000000000000000000} # CHANGEME: generate via `openssl rand -base64 33`
- SOURCEBOT_ENCRYPTION_KEY=${SOURCEBOT_ENCRYPTION_KEY:-000000000000000000000000000000000} # CHANGEME: generate via `openssl rand -base64 24`
- DATABASE_URL=${DATABASE_URL:-postgresql://postgres:postgres@postgres:5432/postgres} # CHANGEME
- REDIS_URL=${REDIS_URL:-redis://redis:6379} # CHANGEME
- SOURCEBOT_EE_LICENSE_KEY=${SOURCEBOT_EE_LICENSE_KEY:-}
# For the full list of environment variables see:
# https://docs.sourcebot.dev/docs/configuration/environment-variables
postgres:
image: docker.io/postgres:${POSTGRES_VERSION:-latest}
restart: always
healthcheck:
test: ["CMD-SHELL", "pg_isready -U postgres"]
interval: 3s
timeout: 3s
retries: 10
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres # CHANGEME
POSTGRES_DB: postgres
ports:
- 127.0.0.1:5432:5432
volumes:
- sourcebot_postgres_data:/var/lib/postgresql/data
redis:
image: docker.io/redis:${REDIS_VERSION:-latest}
restart: always
ports:
- 127.0.0.1:6379:6379
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 3s
timeout: 10s
retries: 10
volumes:
- sourcebot_redis_data:/data
volumes:
sourcebot_data:
driver: local
sourcebot_postgres_data:
driver: local
sourcebot_redis_data:
driver: local

View file

@ -21,13 +21,7 @@
"group": "Getting Started",
"pages": [
"docs/overview",
{
"group": "Deployment",
"pages": [
"docs/deployment/docker-compose",
"docs/deployment/k8s"
]
}
"docs/deployment-guide"
]
},
{
@ -85,7 +79,6 @@
]
},
"docs/configuration/language-model-providers",
"docs/configuration/idp",
{
"group": "Authentication",
"pages": [
@ -117,11 +110,6 @@
"href": "https://sourcebot.dev/changelog",
"icon": "list-check"
},
{
"anchor": "Roadmap",
"href": "https://github.com/sourcebot-dev/sourcebot/issues/459",
"icon": "map"
},
{
"anchor": "Support",
"href": "https://github.com/sourcebot-dev/sourcebot/issues/new?template=get_help.md",
@ -144,7 +132,7 @@
"socials": {
"github": "https://github.com/sourcebot-dev/sourcebot",
"twitter": "https://x.com/sourcebot_dev",
"discord": "https://discord.gg/HDScTs3ptP",
"discord": "https://discord.gg/Y6b78RqM",
"linkedin": "https://www.linkedin.com/company/sourcebot"
}
},

View file

@ -10,7 +10,7 @@ Sourcebot's built-in authentication system gates your deployment, and allows adm
<Card horizontal title="Authentication providers" icon="lock" href="/docs/configuration/auth/providers">
Configure additional authentication providers for your deployment.
</Card>
<Card horizontal title="Access settings" icon="user" href="/docs/configuration/auth/access-settings">
<Card horizontal title="Inviting members" icon="user" href="/docs/configuration/auth/inviting-members">
Learn how to configure how members join your deployment.
</Card>
<Card horizontal title="Roles and permissions" icon="shield" href="/docs/configuration/auth/roles-and-permissions">
@ -25,4 +25,4 @@ Sourcebot's built-in authentication system gates your deployment, and allows adm
# Troubleshooting
- If you experience issues logging in, logging out, or accessing an organization you should have access to, try clearing your cookies & performing a full page refresh (`Cmd/Ctrl + Shift + R` on most browsers).
- Still not working? Reach out to us on our [discord](https://discord.gg/HDScTs3ptP) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose)
- Still not working? Reach out to us on our [discord](https://discord.com/invite/6Fhp27x7Pb) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose)

View file

@ -26,5 +26,80 @@ See [transactional emails](/docs/configuration/transactional-emails) for more de
# Enterprise Authentication Providers
Sourcebot supports authentication using several different [external identity providers](/docs/configuration/idp) as well. These identity providers require an
[enterprise license](/docs/license-key)
The following authentication providers require an [enterprise license](/docs/license-key) to be enabled.
### GitHub
---
[Auth.js GitHub Provider Docs](https://authjs.dev/getting-started/providers/github)
**Required environment variables:**
- `AUTH_EE_GITHUB_CLIENT_ID`
- `AUTH_EE_GITHUB_CLIENT_SECRET`
Optional environment variables:
- `AUTH_EE_GITHUB_BASE_URL` - Base URL for GitHub Enterprise (defaults to https://github.com)
### GitLab
---
[Auth.js GitLab Provider Docs](https://authjs.dev/getting-started/providers/gitlab)
**Required environment variables:**
- `AUTH_EE_GITLAB_CLIENT_ID`
- `AUTH_EE_GITLAB_CLIENT_SECRET`
Optional environment variables:
- `AUTH_EE_GITLAB_BASE_URL` - Base URL for GitLab instance (defaults to https://gitlab.com)
### Google
---
[Auth.js Google Provider Docs](https://authjs.dev/getting-started/providers/google)
**Required environment variables:**
- `AUTH_EE_GOOGLE_CLIENT_ID`
- `AUTH_EE_GOOGLE_CLIENT_SECRET`
### GCP IAP
---
<Note>If you're running Sourcebot in an environment that blocks egress, make sure you allow the [IAP IP ranges](https://www.gstatic.com/ipranges/goog.json)</Note>
Custom provider built to enable automatic Sourcebot account registration/login when using GCP IAP.
**Required environment variables:**
- `AUTH_EE_GCP_IAP_ENABLED`
- `AUTH_EE_GCP_IAP_AUDIENCE`
- This can be found by selecting the ⋮ icon next to the IAP-enabled backend service and pressing `Get JWT audience code`
### Okta
---
[Auth.js Okta Provider Docs](https://authjs.dev/getting-started/providers/okta)
**Required environment variables:**
- `AUTH_EE_OKTA_CLIENT_ID`
- `AUTH_EE_OKTA_CLIENT_SECRET`
- `AUTH_EE_OKTA_ISSUER`
### Keycloak
---
[Auth.js Keycloak Provider Docs](https://authjs.dev/getting-started/providers/keycloak)
**Required environment variables:**
- `AUTH_EE_KEYCLOAK_CLIENT_ID`
- `AUTH_EE_KEYCLOAK_CLIENT_SECRET`
- `AUTH_EE_KEYCLOAK_ISSUER`
### Microsoft Entra ID
[Auth.js Microsoft Entra ID Provider Docs](https://authjs.dev/getting-started/providers/microsoft-entra-id)
**Required environment variables:**
- `AUTH_EE_MICROSOFT_ENTRA_ID_CLIENT_ID`
- `AUTH_EE_MICROSOFT_ENTRA_ID_CLIENT_SECRET`
- `AUTH_EE_MICROSOFT_ENTRA_ID_ISSUER`
---

View file

@ -3,9 +3,6 @@ title: Config File
sidebarTitle: Config file
---
import ConfigSchema from '/snippets/schemas/v3/index.schema.mdx'
import EnvironmentOverridesSchema from '/snippets/schemas/v3/environmentOverrides.schema.mdx'
When self-hosting Sourcebot, you **must** provide it a config file. This is done by defining a config file in a volume that's mounted to Sourcebot, and providing the path to this
file in the `CONFIG_PATH` environment variable. For example:
@ -52,103 +49,3 @@ The following are settings that can be provided in your config file to modify So
| `enablePublicAccess` **(deprecated)** | boolean | false | — | Use the `FORCE_ENABLE_ANONYMOUS_ACCESS` environment variable instead. |
| `experiment_repoDrivenPermissionSyncIntervalMs` | number | 24hours | 1 | Interval at which the repo permission syncer should run. |
| `experiment_userDrivenPermissionSyncIntervalMs` | number | 24hours | 1 | Interval at which the user permission syncer should run. |
# Tokens
Tokens are used to securely pass secrets to Sourcebot in a config file. They are used in various places, including connections, language model providers, auth providers, etc. Tokens can be passed as either environment variables or Google Cloud secrets:
<AccordionGroup>
<Accordion title="Environment Variables">
```json
{
"token": {
"env": "TOKEN_NAME"
}
}
```
</Accordion>
<Accordion title="Google Cloud Secrets">
```json
{
"token": {
"googleCloudSecret": "projects/<project-id>/secrets/<secret-name>/versions/<version-id>"
}
}
```
</Accordion>
</AccordionGroup>
# Overriding environment variables from the config
You can override / set environment variables from the config file by using the `environmentOverrides` property. Overrides can be of type `string`, `number`, `boolean`, or a [token](/docs/configuration/config-file#tokens). Tokens are useful when you want to configure an environment variable using a Google Cloud Secret or other supported secret management service.
<AccordionGroup>
<Accordion title="Token">
```jsonc
{
"environmentOverrides": {
"DATABASE_URL": {
"type": "token",
"value": {
"googleCloudSecret": "projects/<id>/secrets/postgres-connection-string/versions/latest"
}
},
"REDIS_URL": {
"type": "token",
"value": {
"googleCloudSecret": "projects/<id>/secrets/redis-connection-string/versions/latest"
}
}
},
}
```
</Accordion>
<Accordion title="String">
```jsonc
{
"environmentOverrides": {
"EMAIL_FROM_ADDRESS": {
"type": "string",
"value": "hello@sourcebot.dev"
}
}
}
```
</Accordion>
<Accordion title="Number">
```jsonc
{
"environmentOverrides": {
"SOURCEBOT_CHAT_MODEL_TEMPERATURE": {
"type": "number",
"value": 0.5
}
}
}
```
</Accordion>
<Accordion title="Boolean">
```jsonc
{
"environmentOverrides": {
"SOURCEBOT_TELEMETRY_DISABLED": {
"type": "boolean",
"value": false
}
}
}
```
</Accordion>
</AccordionGroup>
**Note:** Overrides are **not** set as system environment variables, and instead are resolved at runtime on startup and stored in memory.
<Accordion title="Schema reference">
[schemas/v3/environmentOverrides.json](https://github.com/sourcebot-dev/sourcebot/blob/main/schemas/v3/environmentOverrides.json)
<EnvironmentOverridesSchema />
</Accordion>

View file

@ -1,9 +1,10 @@
---
title: Environment variables
sidebarTitle: Environment variables
mode: "wide"
---
<Note>This page provides a detailed reference of all environment variables supported by Sourcebot. If you're just looking to get up and running, we recommend starting with the [deployment guides](/docs/deployment/docker-compose) instead.</Note>
<Note>This page provides a detailed reference of all environment variables supported by Sourcebot. If you're just looking to get up and running, we recommend starting with the [deployment guide](/docs/deployment-guide) instead.</Note>
### Core Environment Variables
The following environment variables allow you to configure your Sourcebot deployment.
@ -18,7 +19,7 @@ The following environment variables allow you to configure your Sourcebot deploy
| `DATA_CACHE_DIR` | `$DATA_DIR/.sourcebot` | <p>The root data directory in which all data written to disk by Sourcebot will be located.</p> |
| `DATA_DIR` | `/data` | <p>The directory within the container to store all persistent data. Typically, this directory will be volume mapped such that data is persisted across container restarts (e.g., `docker run -v $(pwd):/data`)</p> |
| `DATABASE_DATA_DIR` | `$DATA_CACHE_DIR/db` | <p>The data directory for the default Postgres database.</p> |
| `DATABASE_URL` | `postgresql://postgres@ localhost:5432/sourcebot` | <p>Connection string of your Postgres database. By default, a Postgres database is automatically provisioned at startup within the container.</p><p>If you'd like to use a non-default schema, you can provide it as a parameter in the database url.</p><p>You can also use `DATABASE_HOST`, `DATABASE_USERNAME`, `DATABASE_PASSWORD`, `DATABASE_NAME`, and `DATABASE_ARGS` to construct the database url.</p> |
| `DATABASE_URL` | `postgresql://postgres@ localhost:5432/sourcebot` | <p>Connection string of your Postgres database. By default, a Postgres database is automatically provisioned at startup within the container.</p><p>If you'd like to use a non-default schema, you can provide it as a parameter in the database url </p> |
| `EMAIL_FROM_ADDRESS` | `-` | <p>The email address that transactional emails will be sent from. See [this doc](/docs/configuration/transactional-emails) for more info.</p> |
| `FORCE_ENABLE_ANONYMOUS_ACCESS` | `false` | <p>When enabled, [anonymous access](/docs/configuration/auth/access-settings#anonymous-access) to the organization will always be enabled.</p> |
| `REDIS_DATA_DIR` | `$DATA_CACHE_DIR/redis` | <p>The data directory for the default Redis instance.</p> |
@ -27,6 +28,7 @@ The following environment variables allow you to configure your Sourcebot deploy
| `REDIS_REMOVE_ON_FAIL` | `100` | <p>Controls how many failed jobs are allowed to remain in Redis queues</p> |
| `REPO_SYNC_RETRY_BASE_SLEEP_SECONDS` | `60` | <p>The base sleep duration (in seconds) for exponential backoff when retrying repository sync operations that fail</p> |
| `GITLAB_CLIENT_QUERY_TIMEOUT_SECONDS` | `600` | <p>The timeout duration (in seconds) for GitLab client queries</p> |
| `SHARD_MAX_MATCH_COUNT` | `10000` | <p>The maximum shard count per query</p> |
| `SMTP_CONNECTION_URL` | `-` | <p>The url to the SMTP service used for sending transactional emails. See [this doc](/docs/configuration/transactional-emails) for more info.</p> |
| `SOURCEBOT_ENCRYPTION_KEY` | Automatically generated at startup if no value is provided. Generated using `openssl rand -base64 24` | <p>Used to encrypt connection secrets and generate API keys.</p> |
| `SOURCEBOT_PUBLIC_KEY_PATH` | `/app/public.pem` | <p>Sourcebot's public key that's used to verify encrypted license key signatures.</p> |
@ -34,8 +36,8 @@ The following environment variables allow you to configure your Sourcebot deploy
| `SOURCEBOT_STRUCTURED_LOGGING_ENABLED` | `false` | <p>Enables/disable structured JSON logging. See [this doc](/docs/configuration/structured-logging) for more info.</p> |
| `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - | <p>Optional file to log to if structured logging is enabled</p> |
| `SOURCEBOT_TELEMETRY_DISABLED` | `false` | <p>Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.</p> |
| `DEFAULT_MAX_MATCH_COUNT` | `10000` | <p>The default maximum number of search results to return when using search in the web app.</p> |
| `ALWAYS_INDEX_FILE_PATTERNS` | - | <p>A comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams.</p> |
| `TOTAL_MAX_MATCH_COUNT` | `100000` | <p>The maximum number of matches per query</p> |
| `ZOEKT_MAX_WALL_TIME_MS` | `10000` | <p>The maximum real world duration (in milliseconds) per zoekt query</p> |
### Enterprise Environment Variables
| Variable | Default | Description |
@ -63,15 +65,12 @@ The following environment variables allow you to configure your Sourcebot deploy
### Review Agent Environment Variables
| Variable | Default | Description |
| :------- | :------ | :---------- |
| `GITHUB_REVIEW_AGENT_APP_ID` | `-` | <p>The GitHub App ID used for review agent authentication.</p> |
| `GITHUB_REVIEW_AGENT_APP_PRIVATE_KEY_PATH` | `-` | <p>The container relative path to the private key file for the GitHub App used by the review agent.</p> |
| `GITHUB_REVIEW_AGENT_APP_WEBHOOK_SECRET` | `-` | <p>The webhook secret for the GitHub App used by the review agent.</p> |
| `GITHUB_APP_ID` | `-` | <p>The GitHub App ID used for review agent authentication.</p> |
| `GITHUB_APP_PRIVATE_KEY_PATH` | `-` | <p>The container relative path to the private key file for the GitHub App used by the review agent.</p> |
| `GITHUB_APP_WEBHOOK_SECRET` | `-` | <p>The webhook secret for the GitHub App used by the review agent.</p> |
| `OPENAI_API_KEY` | `-` | <p>The OpenAI API key used by the review agent.</p> |
| `REVIEW_AGENT_API_KEY` | `-` | <p>The Sourcebot API key used by the review agent.</p> |
| `REVIEW_AGENT_AUTO_REVIEW_ENABLED` | `false` | <p>Enables/disables automatic code reviews by the review agent.</p> |
| `REVIEW_AGENT_LOGGING_ENABLED` | `true` | <p>Enables/disables logging for the review agent. Logs are saved in `DATA_CACHE_DIR/review-agent`</p> |
| `REVIEW_AGENT_REVIEW_COMMAND` | `review` | <p>The command used to trigger a code review by the review agent.</p> |
### Overriding environment variables from the config
You can override environment variables from the config file by using the `environmentOverrides` property. See [this doc](/docs/configuration/config-file#overriding-environment-variables-from-the-config) for more info.

View file

@ -1,418 +0,0 @@
---
title: External Identity Providers
sidebarTitle: External identity providers
---
import LicenseKeyRequired from '/snippets/license-key-required.mdx'
<LicenseKeyRequired />
You can connect Sourcebot to various **external identity providers** to associate a Sourcebot user with one or more external service accounts (ex. Google, GitHub, etc).
External identity providers can be used for [authentication](/docs/configuration/auth) and/or [permission syncing](/docs/features/permission-syncing). They're defined in the
[config file](/docs/configuration/config-file) in the top-level `identityProviders` object:
```json wrap icon="code" Example config with both google and github identity providers defined
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "github",
"purpose": "account_linking",
"accountLinkingRequired": true,
"clientId": {
"env": "GITHUB_IDENTITY_PROVIDER_CLIENT_ID"
},
"clientSecret": {
"env": "GITHUB_IDENTITY_PROVIDER_CLIENT_SECRET"
}
},
{
"provider": "google",
"clientId": {
"env": "GOOGLE_IDENTITY_PROVIDER_CLIENT_ID"
},
"clientSecret": {
"env": "GOOGLE_IDENTITY_PROVIDER_CLIENT_SECRET"
}
}
]
}
```
Secret values (such as `clientId` and `clientSecret`) can be provided as environment variables or Google Cloud secrets via [tokens](/docs/configuration/config-file#tokens).
# Supported External Identity Providers
Sourcebot uses [Auth.js](https://authjs.dev/) to connect to external identity providers. If there's a provider supported by Auth.js that you don't see below, please submit a
[feature request](https://github.com/sourcebot-dev/sourcebot/issues) to have it added.
### GitHub
[Auth.js GitHub Provider Docs](https://authjs.dev/getting-started/providers/github)
A GitHub connection can be used for either [authentication](/docs/configuration/auth) or [permission syncing](/docs/features/permission-syncing). This is controlled using the `purpose` field
in the GitHub identity provider config.
<Accordion title="instructions">
<Steps>
<Step title="Register an OAuth Client">
To begin, you must register an OAuth client in GitHub to facilitate the identity provider connection. You can do this by creating a **GitHub App** or a **GitHub OAuth App**. Either
one works, but the **GitHub App** is the [recommended mechanism](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/differences-between-github-apps-and-oauth-apps).
The result of registering an OAuth client is a `CLIENT_ID` and `CLIENT_SECRET` which you'll provide to Sourcebot.
<Tabs>
<Tab title="GitHub App">
<Note>You don't need to install the app to use it as an external identity provider</Note>
Follow [this guide](https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/registering-a-github-app) to register a new GitHub App.
When asked to provide a callback url, provide `<sourcebot_url>/api/auth/callback/github` (ex. https://sourcebot.coolcorp.com/api/auth/callback/github)
Set the following fine-grained permissions in the GitHub App:
- `"Email addresses" account permissions (read)`
- `"Metadata" repository permissions (read)` (only needed if using permission syncing)
</Tab>
<Tab title="GitHub OAuth App">
Follow [this guide](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) by GitHub to create an OAuth App.
When asked to provide a callback url, provide `<sourcebot_url>/api/auth/callback/github` (ex. https://sourcebot.coolcorp.com/api/auth/callback/github)
</Tab>
</Tabs>
</Step>
<Step title="Define environment variables">
To provide Sourcebot the client id and secret for your OAuth client you must set them as environment variables. These can be named whatever you like
(ex. `GITHUB_IDENTITY_PROVIDER_CLIENT_ID` and `GITHUB_IDENTITY_PROVIDER_CLIENT_SECRET`)
</Step>
<Step title="Define the identity provider config">
Finally, pass the client id and secret to Sourcebot by defining an `identityProvider` object in the [config file](/docs/configuration/config-file):
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "github",
// "sso" for auth + perm sync, "account_linking" for only perm sync
"purpose": "account_linking",
// if purpose == "account_linking" this controls if a user must connect to the IdP
"accountLinkingRequired": true,
"clientId": {
"env": "YOUR_CLIENT_ID_ENV_VAR"
},
"clientSecret": {
"env": "YOUR_CLIENT_SECRET_ENV_VAR"
}
}
]
}
```
</Step>
</Steps>
</Accordion>
### GitLab
[Auth.js GitLab Provider Docs](https://authjs.dev/getting-started/providers/gitlab)
A GitLab connection can be used for either [authentication](/docs/configuration/auth) or [permission syncing](/docs/features/permission-syncing). This is controlled using the `purpose` field
in the GitLab identity provider config.
<Accordion title="instructions">
<Steps>
<Step title="Register an OAuth Application">
To begin, you must register an OAuth application in GitLab to facilitate the identity provider connection.
Follow [this guide](https://docs.gitlab.com/integration/oauth_provider/) by GitLab to create an OAuth application.
When configuring your application:
- Set the callback URL to `<sourcebot_url>/api/auth/callback/gitlab` (ex. https://sourcebot.coolcorp.com/api/auth/callback/gitlab)
- Enable the `read_user` scope
- If using for permission syncing, also enable the `read_api` scope
The result of registering an OAuth application is an `APPLICATION_ID` (`CLIENT_ID`) and `SECRET` (`CLIENT_SECRET`) which you'll provide to Sourcebot.
</Step>
<Step title="Define environment variables">
To provide Sourcebot the client id and secret for your OAuth application you must set them as environment variables. These can be named whatever you like
(ex. `GITLAB_IDENTITY_PROVIDER_CLIENT_ID` and `GITLAB_IDENTITY_PROVIDER_CLIENT_SECRET`)
</Step>
<Step title="Define the identity provider config">
Finally, pass the client id and secret to Sourcebot by defining an `identityProvider` object in the [config file](/docs/configuration/config-file):
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "gitlab",
// "sso" for auth + perm sync, "account_linking" for only perm sync
"purpose": "account_linking",
// if purpose == "account_linking" this controls if a user must connect to the IdP
"accountLinkingRequired": true,
"clientId": {
"env": "YOUR_CLIENT_ID_ENV_VAR"
},
"clientSecret": {
"env": "YOUR_CLIENT_SECRET_ENV_VAR"
},
// Optional: for self-hosted GitLab instances
"baseUrl": "https://gitlab.example.com"
}
]
}
```
</Step>
</Steps>
</Accordion>
### Google
[Auth.js Google Provider Docs](https://authjs.dev/getting-started/providers/google)
A Google connection can be used for [authentication](/docs/configuration/auth).
<Accordion title="instructions">
<Steps>
<Step title="Register an OAuth Client">
To begin, you must register an OAuth client in Google Cloud Console to facilitate the identity provider connection.
Follow [this guide](https://support.google.com/cloud/answer/6158849) by Google to create OAuth 2.0 credentials.
When configuring your OAuth client:
- Set the application type to "Web application"
- Add `<sourcebot_url>/api/auth/callback/google` to the authorized redirect URIs (ex. https://sourcebot.coolcorp.com/api/auth/callback/google)
The result of creating OAuth credentials is a `CLIENT_ID` and `CLIENT_SECRET` which you'll provide to Sourcebot.
</Step>
<Step title="Define environment variables">
To provide Sourcebot the client id and secret for your OAuth client you must set them as environment variables. These can be named whatever you like
(ex. `GOOGLE_IDENTITY_PROVIDER_CLIENT_ID` and `GOOGLE_IDENTITY_PROVIDER_CLIENT_SECRET`)
</Step>
<Step title="Define the identity provider config">
Finally, pass the client id and secret to Sourcebot by defining an `identityProvider` object in the [config file](/docs/configuration/config-file):
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "google",
"purpose": "sso",
"clientId": {
"env": "YOUR_CLIENT_ID_ENV_VAR"
},
"clientSecret": {
"env": "YOUR_CLIENT_SECRET_ENV_VAR"
}
}
]
}
```
</Step>
</Steps>
</Accordion>
### Okta
[Auth.js Okta Provider Docs](https://authjs.dev/getting-started/providers/okta)
An Okta connection can be used for [authentication](/docs/configuration/auth).
<Accordion title="instructions">
<Steps>
<Step title="Register an OAuth Application">
To begin, you must register an OAuth application in Okta to facilitate the identity provider connection.
Follow [this guide](https://developer.okta.com/docs/guides/implement-oauth-for-okta/main/) by Okta to create an OAuth application.
When configuring your application:
- Set the application type to "Web Application"
- Add `<sourcebot_url>/api/auth/callback/okta` to the sign-in redirect URIs (ex. https://sourcebot.coolcorp.com/api/auth/callback/okta)
The result of creating an OAuth application is a `CLIENT_ID`, `CLIENT_SECRET`, and `ISSUER` URL which you'll provide to Sourcebot.
</Step>
<Step title="Define environment variables">
To provide Sourcebot the client id, client secret, and issuer for your OAuth application you must set them as environment variables. These can be named whatever you like
(ex. `OKTA_IDENTITY_PROVIDER_CLIENT_ID`, `OKTA_IDENTITY_PROVIDER_CLIENT_SECRET`, and `OKTA_IDENTITY_PROVIDER_ISSUER`)
</Step>
<Step title="Define the identity provider config">
Finally, pass the client id, client secret, and issuer to Sourcebot by defining an `identityProvider` object in the [config file](/docs/configuration/config-file):
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "okta",
"purpose": "sso",
"clientId": {
"env": "YOUR_CLIENT_ID_ENV_VAR"
},
"clientSecret": {
"env": "YOUR_CLIENT_SECRET_ENV_VAR"
},
"issuer": {
"env": "YOUR_ISSUER_ENV_VAR"
}
}
]
}
```
</Step>
</Steps>
</Accordion>
### Keycloak
[Auth.js Keycloak Provider Docs](https://authjs.dev/getting-started/providers/keycloak)
A Keycloak connection can be used for [authentication](/docs/configuration/auth).
<Accordion title="instructions">
<Steps>
<Step title="Register an OAuth Client">
To begin, you must register an OAuth client in Keycloak to facilitate the identity provider connection.
Follow [this guide](https://www.keycloak.org/docs/latest/server_admin/#_oidc_clients) by Keycloak to create an OpenID Connect client.
When configuring your client:
- Set the client protocol to "openid-connect"
- Set the access type to "confidential"
- Add `<sourcebot_url>/api/auth/callback/keycloak` to the valid redirect URIs (ex. https://sourcebot.coolcorp.com/api/auth/callback/keycloak)
The result of creating an OAuth client is a `CLIENT_ID`, `CLIENT_SECRET`, and an `ISSUER` URL (typically in the format `https://<keycloak-domain>/realms/<realm-name>`) which you'll provide to Sourcebot.
</Step>
<Step title="Define environment variables">
To provide Sourcebot the client id, client secret, and issuer for your OAuth client you must set them as environment variables. These can be named whatever you like
(ex. `KEYCLOAK_IDENTITY_PROVIDER_CLIENT_ID`, `KEYCLOAK_IDENTITY_PROVIDER_CLIENT_SECRET`, and `KEYCLOAK_IDENTITY_PROVIDER_ISSUER`)
</Step>
<Step title="Define the identity provider config">
Finally, pass the client id, client secret, and issuer to Sourcebot by defining an `identityProvider` object in the [config file](/docs/configuration/config-file):
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "keycloak",
"purpose": "sso",
"clientId": {
"env": "YOUR_CLIENT_ID_ENV_VAR"
},
"clientSecret": {
"env": "YOUR_CLIENT_SECRET_ENV_VAR"
},
"issuer": {
"env": "YOUR_ISSUER_ENV_VAR"
}
}
]
}
```
</Step>
</Steps>
</Accordion>
### Microsoft Entra ID
[Auth.js Microsoft Entra ID Provider Docs](https://authjs.dev/getting-started/providers/microsoft-entra-id)
A Microsoft Entra ID connection can be used for [authentication](/docs/configuration/auth).
<Accordion title="instructions">
<Steps>
<Step title="Register an OAuth Application">
To begin, you must register an OAuth application in Microsoft Entra ID (formerly Azure Active Directory) to facilitate the identity provider connection.
Follow [this guide](https://learn.microsoft.com/en-us/entra/identity-platform/quickstart-register-app) by Microsoft to register an application.
When configuring your application:
- Under "Authentication", add a platform and select "Web"
- Set the redirect URI to `<sourcebot_url>/api/auth/callback/microsoft-entra-id` (ex. https://sourcebot.coolcorp.com/api/auth/callback/microsoft-entra-id)
- Under "Certificates & secrets", create a new client secret
The result of registering an application is a `CLIENT_ID` (Application ID), `CLIENT_SECRET`, and `TENANT_ID` which you'll use to construct the issuer URL.
</Step>
<Step title="Define environment variables">
To provide Sourcebot the client id, client secret, and issuer for your OAuth application you must set them as environment variables. These can be named whatever you like
(ex. `MICROSOFT_ENTRA_ID_IDENTITY_PROVIDER_CLIENT_ID`, `MICROSOFT_ENTRA_ID_IDENTITY_PROVIDER_CLIENT_SECRET`, and `MICROSOFT_ENTRA_ID_IDENTITY_PROVIDER_ISSUER`)
The issuer URL should be in the format: `https://login.microsoftonline.com/<TENANT_ID>/v2.0`
</Step>
<Step title="Define the identity provider config">
Finally, pass the client id, client secret, and issuer to Sourcebot by defining an `identityProvider` object in the [config file](/docs/configuration/config-file):
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "microsoft-entra-id",
"purpose": "sso",
"clientId": {
"env": "YOUR_CLIENT_ID_ENV_VAR"
},
"clientSecret": {
"env": "YOUR_CLIENT_SECRET_ENV_VAR"
},
"issuer": {
"env": "YOUR_ISSUER_ENV_VAR"
}
}
]
}
```
</Step>
</Steps>
</Accordion>
### Authentik
[Auth.js Authentik Provider Docs](https://authjs.dev/getting-started/providers/authentik)
An Authentik connection can be used for [authentication](/docs/configuration/auth).
<Accordion title="instructions">
<Steps>
<Step title="Create an OAuth2/OpenID Connect application">
To begin, you must create an OAuth2/OpenID Connect application in Authentik. For more information, see the [Authentik documentation](https://docs.goauthentik.io/add-secure-apps/applications/manage_apps/#create-an-application-and-provider-pair).
When configuring your application:
- Set the provider type to "OAuth2/OpenID Connect"
- Set the client type to "Confidential"
- Add `<sourcebot_url>/api/auth/callback/authentik` to the redirect URIs (ex. https://sourcebot.coolcorp.com/api/auth/callback/authentik)
After creating the application, open the application details to obtain the client id, client secret, and issuer URL (typically in the format `https://<authentik-domain>/application/o/<provider-slug>/`).
</Step>
<Step title="Define environment variables">
The client id, secret, and issuer URL are provided to Sourcebot via environment variables. These can be named whatever you like
(ex. `AUTHENTIK_IDENTITY_PROVIDER_CLIENT_ID`, `AUTHENTIK_IDENTITY_PROVIDER_CLIENT_SECRET`, and `AUTHENTIK_IDENTITY_PROVIDER_ISSUER`)
</Step>
<Step title="Define the identity provider config">
Create an `identityProvider` object in the [config file](/docs/configuration/config-file) with the following fields:
```json wrap icon="code"
{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"identityProviders": [
{
"provider": "authentik",
"purpose": "sso",
"clientId": {
"env": "AUTHENTIK_IDENTITY_PROVIDER_CLIENT_ID"
},
"clientSecret": {
"env": "AUTHENTIK_IDENTITY_PROVIDER_CLIENT_SECRET"
},
"issuer": {
"env": "AUTHENTIK_IDENTITY_PROVIDER_ISSUER"
}
}
]
}
```
</Step>
</Steps>
</Accordion>

View file

@ -292,7 +292,6 @@ The OpenAI compatible provider allows you to use any model that is compatible wi
<Accordion title="Troubleshooting">
- When using [llama.cpp](https://github.com/ggml-org/llama.cpp), if you hit "Failed after 3 attempts. Last error: tools param requires --jinja flag", add the `--jinja` flag to your `llama-server` command.
- If you're seeing the LLM outputting reasoning tokens wrapped in XML tags (e.g., `<reasoning>`, `<thinking>`, etc.), you can configure the `reasoningTag` parameter to the name of the tag (without angle brackets). This parameter defaults to `think`.
</Accordion>
### OpenRouter

View file

@ -86,7 +86,7 @@ If you're not familiar with Sourcebot [connections](/docs/connections/overview),
Azure Devops Cloud requires you to provide a PAT in order to index your repositories. To learn how to create a PAT, check out the [Azure Devops docs](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?view=azure-devops&tabs=Windows).
Sourcebot needs the `Read` access for the `Code` scope in order to find and clone your repos.
Next, provide the access [token](/docs/configuration/config-file#tokens) via an environment variable which is referenced in the `token` property:
Next, provide the access token via the `token` property, either as an environment variable or a secret:
<Tabs>
<Tab title="Environment Variable">
@ -113,6 +113,28 @@ Next, provide the access [token](/docs/configuration/config-file#tokens) via an
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your PAT:
![](/images/secrets_list.png)
2. Add the `token` property to your connection config:
```json
{
"type": "azuredevops",
"deploymentType": "cloud",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>
## Schema reference

View file

@ -100,7 +100,7 @@ If you're not familiar with Sourcebot [connections](/docs/connections/overview),
Azure Devops Server requires you to provide a PAT in order to index your repositories. To learn how to create a PAT, check out the [Azure Devops docs](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?view=azure-devops&tabs=Windows).
Sourcebot needs the `Read` access for the `Code` scope in order to find and clone your repos.
Next, provide the access [token](/docs/configuration/config-file#tokens) via an environment variable which is referenced in the `token` property:
Next, provide the access token via the `token` property, either as an environment variable or a secret:
<Tabs>
<Tab title="Environment Variable">
@ -127,6 +127,28 @@ Next, provide the access [token](/docs/configuration/config-file#tokens) via an
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your PAT:
![](/images/secrets_list.png)
2. Add the `token` property to your connection config:
```json
{
"type": "azuredevops",
"deploymentType": "server",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>
## Schema reference

View file

@ -78,7 +78,7 @@ If you're not familiar with Sourcebot [connections](/docs/connections/overview),
## Authenticating with Bitbucket Cloud
In order to index private repositories, you'll need to provide authentication credentials via a [token](/docs/configuration/config-file#tokens). You can do this using an `App Password` or an `Access Token`.
In order to index private repositories, you'll need to provide authentication credentials. You can do this using an `App Password` or an `Access Token`.
<Tabs>
<Tab title="App Password">

View file

@ -70,7 +70,7 @@ If you're not familiar with Sourcebot [connections](/docs/connections/overview),
## Authenticating with Bitbucket Data Center
In order to index private repositories, you'll need to provide an access token to Sourcebot via a [token](/docs/configuration/config-file#tokens).
In order to index private repositories, you'll need to provide an access token to Sourcebot.
Create an access token for the desired scope (repo, project, or workspace). Visit the official [Bitbucket Data Center docs](https://confluence.atlassian.com/bitbucketserver/http-access-tokens-939515499.html)
for more info.

View file

@ -81,7 +81,7 @@ In order to index private repositories, you'll need to generate a Gitea access t
![Gitea Access token creation](/images/gitea_pat_creation.png)
Next, provide the access token via an environment variable [token](/docs/configuration/config-file#tokens) which is referenced in the `token` property:
Next, provide the access token via the `token` property, either as an environment variable or a secret:
<Tabs>
<Tab title="Environment Variable">
@ -107,6 +107,27 @@ Next, provide the access token via an environment variable [token](/docs/configu
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your PAT:
![](/images/secrets_list.png)
2. Add the `token` property to your connection config:
```json
{
"type": "gitea",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>
## Connecting to a custom Gitea

View file

@ -128,7 +128,7 @@ In order to index private repositories, you'll need to generate a access token a
</Accordion>
</AccordionGroup>
Next, provide the access token via an environment variable [token](/docs/configuration/config-file#tokens) which is referenced in the `token` property:
Next, provide the access token via the `token` property, either as an environment variable or a secret:
<Tabs>
<Tab title="Environment Variable">
@ -154,6 +154,27 @@ Next, provide the access token via an environment variable [token](/docs/configu
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your PAT:
![](/images/secrets_list.png)
2. Add the `token` property to your connection config:
```json
{
"type": "github",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>
## Connecting to a custom GitHub host

View file

@ -116,7 +116,7 @@ In order to index private projects, you'll need to generate a GitLab Personal Ac
![GitLab PAT Scope](/images/gitlab_pat_scopes.png)
Next, provide the PAT via an environment variable [token](/docs/configuration/config-file#tokens) which is referenced in the `token` property:
Next, provide the PAT via the `token` property, either as an environment variable or a secret:
<Tabs>
<Tab title="Environment Variable">
@ -142,6 +142,27 @@ Next, provide the PAT via an environment variable [token](/docs/configuration/co
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your PAT:
![](/images/secrets_list.png)
2. Add the `token` property to your connection config:
```json
{
"type": "gitlab",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>
## Connecting to a custom GitLab host

View file

@ -69,26 +69,6 @@ To learn more about how to create a connection for a specific code host, check o
<Note>Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md).</Note>
## Indexing Large Files
By default, Sourcebot will skip indexing files that are larger than 2MB or have more than 20,000 trigrams. You can configure this by setting the `maxFileSize` and `maxTrigramCount` [settings](/docs/configuration/config-file#settings).
These limits can be ignored for specific files by passing in a comma separated list of glob patterns matching file paths to the `ALWAYS_INDEX_FILE_PATTERNS` environment variable. For example:
```bash
# Always index all .sum and .lock files
ALWAYS_INDEX_FILE_PATTERNS=**/*.sum,**/*.lock
```
Files that have been skipped are assigned the `skipped` language. You can view a list of all skipped files by using the following query:
```
lang:skipped
```
## Indexing Binary Files
Binary files cannot be indexed by Sourcebot. See [#575](https://github.com/sourcebot-dev/sourcebot/issues/575) for more information.
## Schema reference
---

View file

@ -0,0 +1,88 @@
---
title: "Deployment guide"
---
import SupportedPlatforms from '/snippets/platform-support.mdx'
The following guide will walk you through the steps to deploy Sourcebot on your own infrastructure. Sourcebot is distributed as a [single docker container](/docs/overview#architecture) that can be deployed to a k8s cluster, a VM, or any platform that supports docker.
<Note>Hit an issue? Please let us know on [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose) or by [emailing us](mailto:team@sourcebot.dev).</Note>
<Steps>
<Step title="Requirements">
- Docker -> use [Docker Desktop](https://www.docker.com/products/docker-desktop/) on Mac or Windows.
</Step>
<Step title="Create a config.json">
Create a `config.json` file that tells Sourcebot which repositories to sync and index:
```bash wrap icon="terminal" Create example config
touch config.json
echo '{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
"connections": {
// comments are supported
"starter-connection": {
"type": "github",
"repos": [
"sourcebot-dev/sourcebot"
]
}
}
}' > config.json
```
This config creates a single GitHub connection named `starter-connection` that specifies [Sourcebot](https://github.com/sourcebot-dev/sourcebot) as a repo to sync. [Learn more about the config file](/docs/configuration/config-file).
</Step>
<Step title="Launch your instance">
<Warning>If you're deploying Sourcebot behind a domain, you must set the [AUTH_URL](/docs/configuration/environment-variables) environment variable.</Warning>
In the same directory as `config.json`, run the following command to start your instance:
``` bash icon="terminal" Start the Sourcebot container
docker run \
-p 3000:3000 \
--pull=always \
--rm \
-v $(pwd):/data \
-e CONFIG_PATH=/data/config.json \
--name sourcebot \
ghcr.io/sourcebot-dev/sourcebot:latest
```
<Accordion title="Details">
**This command**:
- pulls the latest version of the `sourcebot` docker image.
- mounts the working directory to `/data` in the container to allow Sourcebot to persist data across restarts, and to access the `config.json`. In your local directory, you should see a `.sourcebot` folder created that contains all persistent data.
- runs any pending database migrations.
- starts up all services, including the webserver exposed on port 3000.
- reads `config.json` and starts syncing.
</Accordion>
</Step>
<Step title="Complete onboarding">
Navigate to `http://localhost:3000` and complete the onboarding flow.
</Step>
<Step title="Done">
You're all set! If you'd like to set up [Ask Sourcebot](/docs/features/ask/overview), configure a language model [provider](/docs/configuration/language-model-providers).
</Step>
</Steps>
## Next steps
---
<CardGroup cols={3}>
<Card title="Index your code" icon="code" href="/docs/connections/overview">
Learn how to index your code using Sourcebot
</Card>
<Card title="Language models" icon="brain" href="/docs/configuration/language-model-providers">
Learn how to configure language model providers to start using [Ask Sourcebot](/docs/features/ask/overview)
</Card>
<Card title="Authentication" icon="lock" href="/docs/configuration/auth/overview">
Learn more about how to set up SSO, email codes, and other authentication providers.
</Card>
</CardGroup>

View file

@ -1,61 +0,0 @@
---
title: "Docker Compose"
---
This guide will walk you through deploying Sourcebot locally or on a VM using Docker Compose. We will use the [docker-compose.yml](https://github.com/sourcebot-dev/sourcebot/blob/main/docker-compose.yml) file from the Sourcebot repository. This is the simplest way to get started with Sourcebot.
If you are looking to deploy onto Kubernetes, see the [Kubernetes (Helm)](/docs/deployment/k8s) guide.
## Get started
<Steps>
<Step title="Requirements">
- docker & docker compose. Use [Docker Desktop](https://www.docker.com/products/docker-desktop/) on Mac or Windows.
</Step>
<Step title="Obtain the Docker Compose file">
Download the [docker-compose.yml](https://github.com/sourcebot-dev/sourcebot/blob/main/docker-compose.yml) file from the Sourcebot repository.
```bash wrap icon="terminal"
curl -o docker-compose.yml https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/docker-compose.yml
```
</Step>
<Step title="Create a config.json">
In the same directory as the `docker-compose.yml` file, create a [configuration file](/docs/configuration/config-file). The configuration file is a JSON file that configures Sourcebot's behaviour, including what repositories to index, language model providers, auth providers, and more.
```bash wrap icon="terminal" Create example config
touch config.json
echo '{
"$schema": "https://raw.githubusercontent.com/sourcebot-dev/sourcebot/main/schemas/v3/index.json",
// Comments are supported.
// This config creates a single connection to GitHub.com that
// indexes the Sourcebot repository
"connections": {
"starter-connection": {
"type": "github",
"repos": [
"sourcebot-dev/sourcebot"
]
}
}
}' > config.json
```
</Step>
<Step title="Launch your instance">
Update the secrets in the `docker-compose.yml` and then run Sourcebot using:
```bash wrap icon="terminal"
docker compose up
```
</Step>
<Step title="Done">
You're all set! Navigate to [http://localhost:3000](http://localhost:3000) to access your Sourcebot instance.
</Step>
</Steps>
## Next steps

View file

@ -1,4 +0,0 @@
---
title: "Kubernetes (Helm)"
url: https://github.com/sourcebot-dev/sourcebot-helm-chart
---

View file

@ -10,7 +10,7 @@ codebase that the agent may fetch to perform the review.
This agent provides codebase-aware reviews for your PRs. For each diff, this agent fetches relevant context from Sourcebot and feeds it into an LLM for a detailed review of your changes.
The AI Code Review Agent is [fair source](https://github.com/sourcebot-dev/sourcebot/tree/main/packages/web/src/features/agents/review-agent) and packaged in [Sourcebot](https://github.com/sourcebot-dev/sourcebot). To get started using this agent, [deploy Sourcebot](/docs/deployment/docker-compose)
The AI Code Review Agent is [fair source](https://github.com/sourcebot-dev/sourcebot/tree/main/packages/web/src/features/agents/review-agent) and packaged in [Sourcebot](https://github.com/sourcebot-dev/sourcebot). To get started using this agent, [deploy Sourcebot](/docs/deployment-guide)
and then follow the configuration instructions below.
![AI Code Review Agent Example](/images/review_agent_example.png)
@ -44,9 +44,9 @@ Before you get started, make sure you have an OpenAPI account that you can creat
<Step title="Configure the environment variables in Sourcebot">
Sourcebot requires the following environment variables to begin reviewing PRs through your new GitHub app:
- `GITHUB_REVIEW_AGENT_APP_ID`: The client ID of your GitHub app. Can be found in your [app settings](https://docs.github.com/en/apps/creating-github-apps/writing-code-for-a-github-app/quickstart#navigate-to-your-app-settings)
- `GITHUB_REVIEW_AGENT_APP_WEBHOOK_SECRET`: The webhook secret you defined in your GitHub app. Can be found in your [app settings](https://docs.github.com/en/apps/creating-github-apps/writing-code-for-a-github-app/quickstart#navigate-to-your-app-settings)
- `GITHUB_REVIEW_AGENT_APP_PRIVATE_KEY_PATH`: The path to your app's private key. If you're running Sourcebot from a container, this is the path to this file from within your container
- `GITHUB_APP_ID`: The client ID of your GitHub app. Can be found in your [app settings](https://docs.github.com/en/apps/creating-github-apps/writing-code-for-a-github-app/quickstart#navigate-to-your-app-settings)
- `GITHUB_APP_WEBHOOK_SECRET`: The webhook secret you defined in your GitHub app. Can be found in your [app settings](https://docs.github.com/en/apps/creating-github-apps/writing-code-for-a-github-app/quickstart#navigate-to-your-app-settings)
- `GITHUB_APP_PRIVATE_KEY_PATH`: The path to your app's private key. If you're running Sourcebot from a container, this is the path to this file from within your container
(ex `/data/review-agent-key.pem`). You must copy the private key file into the directory you mount to Sourcebot (similar to the config file).
You can generate a private key file for your app in the [app settings](https://docs.github.com/en/apps/creating-github-apps/writing-code-for-a-github-app/quickstart#navigate-to-your-app-settings). You must copy this private key file into the
@ -74,9 +74,9 @@ Before you get started, make sure you have an OpenAPI account that you can creat
- "/Users/michael/sourcebot_review_agent_workspace:/data"
environment:
CONFIG_PATH: "/data/config.json"
GITHUB_REVIEW_AGENT_APP_ID: "my-github-app-id"
GITHUB_REVIEW_AGENT_APP_WEBHOOK_SECRET: "my-github-app-webhook-secret"
GITHUB_REVIEW_AGENT_APP_PRIVATE_KEY_PATH: "/data/review-agent-key.pem"
GITHUB_APP_ID: "my-github-app-id"
GITHUB_APP_WEBHOOK_SECRET: "my-github-app-webhook-secret"
GITHUB_APP_PRIVATE_KEY_PATH: "/data/review-agent-key.pem"
REVIEW_AGENT_API_KEY: "sourcebot-my-key"
OPENAI_API_KEY: "sk-proj-my-open-api-key"
```

View file

@ -14,7 +14,7 @@ follow code nav references, and provide an answer thats rich with inline cita
<Card title="Index repos" icon="book" href="/docs/connections/overview" horizontal="true">
Learn how to index your repos so you can ask questions about them
</Card>
<Card title="Deployment guide" icon="server" href="/docs/deployment/docker-compose" horizontal="true">
<Card title="Deployment guide" icon="server" href="/docs/deployment-guide" horizontal="true">
Learn how to self-host Sourcebot in a few simple steps.
</Card>
<Card title="Public demo" icon="globe" href="https://demo.sourcebot.dev/" horizontal="true">

View file

@ -21,7 +21,6 @@ import LicenseKeyRequired from '/snippets/license-key-required.mdx'
| **Go to definition** | Clicking the "go to definition" button in the popover or clicking the symbol name navigates to the symbol's definition. |
| **Find references** | Clicking the "find all references" button in the popover lists all references in the explore panel. |
| **Explore panel** | Lists all references and definitions for the symbol selected in the popover. |
| **Cross-repository navigation** | You can search across all repositories by clicking the globe icon in the explore panel. By default, references and definitions are scoped to the repository where the symbol is being resolved. |
## How does it work?

View file

@ -9,7 +9,7 @@ The [Model Context Protocol](https://modelcontextprotocol.io/introduction) (MCP)
<Steps>
<Step title="Launch Sourcebot">
Follow the [deployment guides](/docs/deployment/docker-compose) to launch Sourcebot and get your code indexed. The host url of your instance (e.g., `http://localhost:3000`) is passed to the MCP server via the `SOURCEBOT_HOST` url.
Follow the [deployment guide](/docs/deployment-guide) to launch Sourcebot and get your code indexed. The host url of your instance (e.g., `http://localhost:3000`) is passed to the MCP server via the `SOURCEBOT_HOST` url.
If a host is not provided, then the server will fall back to using the demo instance hosted at https://demo.sourcebot.dev. You can see the list of repositories indexed [here](https://demo.sourcebot.dev/~/repos). Add additional repositories by [opening a PR](https://github.com/sourcebot-dev/sourcebot/blob/main/demo-site-config.json).
</Step>

View file

@ -1,20 +1,21 @@
---
title: "Permission syncing"
sidebarTitle: "Permission syncing"
tag: "experimental"
---
import LicenseKeyRequired from '/snippets/license-key-required.mdx'
import ExperimentalFeatureWarning from '/snippets/experimental-feature-warning.mdx'
<LicenseKeyRequired />
<ExperimentalFeatureWarning />
# Overview
Permission syncing allows you to sync Access Control Lists (ACLs) from a code host to Sourcebot. When configured, users signed into Sourcebot will only be able to access repositories
that they have access to on the code host. Practically, this means:
Permission syncing allows you to sync Access Control Lists (ACLs) from a code host to Sourcebot. When configured, users signed into Sourcebot (via the code host's OAuth provider) will only be able to access repositories that they have access to on the code host. Practically, this means:
- Code Search results will only include repositories that the user has access to.
- Code navigation results will only include repositories that the user has access to.
- MCP results will only include results from repositories the user has access to.
- Ask Sourcebot (and the underlying LLM) will only have access to repositories that the user has access to.
- File browsing is scoped to the repositories that the user has access to.
@ -34,7 +35,7 @@ We are actively working on supporting more code hosts. If you'd like to see a sp
| Platform | Permission syncing |
|:----------|------------------------------|
| [GitHub (GHEC & GHES)](/docs/features/permission-syncing#github) | ✅ |
| [GitLab (Self-managed & Cloud)](/docs/features/permission-syncing#gitlab) | ✅ |
| GitLab | 🛑 |
| Bitbucket Cloud | 🛑 |
| Bitbucket Data Center | 🛑 |
| Gitea | 🛑 |
@ -45,7 +46,7 @@ We are actively working on supporting more code hosts. If you'd like to see a sp
## GitHub
Prerequisite: Configure GitHub as an [external identity provider](/docs/configuration/idp).
Prerequisite: [Add GitHub as an OAuth provider](/docs/configuration/auth/providers#github).
Permission syncing works with **GitHub.com**, **GitHub Enterprise Cloud**, and **GitHub Enterprise Server**. For organization-owned repositories, users that have **read-only** access (or above) via the following methods will have their access synced to Sourcebot:
- Outside collaborators
@ -55,21 +56,9 @@ Permission syncing works with **GitHub.com**, **GitHub Enterprise Cloud**, and *
- Organization owners.
**Notes:**
- A GitHub [external identity provider](/docs/configuration/idp) must be configured to (1) correlate a Sourcebot user with a GitHub user, and (2) to list repositories that the user has access to for [User driven syncing](/docs/features/permission-syncing#how-it-works).
- A GitHub OAuth provider must be configured to (1) correlate a Sourcebot user with a GitHub user, and (2) to list repositories that the user has access to for [User driven syncing](/docs/features/permission-syncing#how-it-works).
- OAuth tokens must assume the `repo` scope in order to use the [List repositories for the authenticated user API](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repositories-for-the-authenticated-user) during [User driven syncing](/docs/features/permission-syncing#how-it-works). Sourcebot **will only** use this token for **reads**.
## GitLab
Prerequisite: Configure GitLab as an [external identity provider](/docs/configuration/idp).
Permission syncing works with **GitLab Self-managed** and **GitLab Cloud**. Users with **Guest** role or above with membership to a group or project will have their access synced to Sourcebot. Both direct and indirect membership to a group or project will be synced with Sourcebot. For more details, see the [GitLab docs](https://docs.gitlab.com/user/project/members/#membership-types).
**Notes:**
- A GitLab [external identity provider](/docs/configuration/idp) must be configured to (1) correlate a Sourcebot user with a GitLab user, and (2) to list repositories that the user has access to for [User driven syncing](/docs/features/permission-syncing#how-it-works).
- OAuth tokens require the `read_api` scope in order to use the [List projects for the authenticated user API](https://docs.gitlab.com/ee/api/projects.html#list-all-projects) during [User driven syncing](/docs/features/permission-syncing#how-it-works).
# How it works
Permission syncing works by periodically syncing ACLs from the code host(s) to Sourcebot to build an internal mapping between Users and Repositories. This mapping is hydrated in two directions:

View file

@ -22,7 +22,7 @@ Search across all your repos/branches across any code host platform. Blazingly f
<Card title="Branches" icon="split" href="/docs/features/search/multi-branch-indexing" horizontal="true">
Learn how to index and search through your branches
</Card>
<Card title="Deployment guides" icon="server" href="/docs/deployment/docker-compose" horizontal="true">
<Card title="Deployment guide" icon="server" href="/docs/deployment-guide" horizontal="true">
Learn how to self-host Sourcebot in a few simple steps.
</Card>
<Card title="Public demo" icon="globe" href="https://demo.sourcebot.dev/" horizontal="true">

View file

@ -4,51 +4,32 @@ title: Writing search queries
Sourcebot uses a powerful regex-based query language that enables precise code search within large codebases.
## Syntax reference guide
Queries consist of space-separated search patterns that are matched against file contents. A file must have at least one match for each expression to be included. Queries can optionally contain search filters to further refine the search results.
## Keyword search (default)
Keyword search matches search patterns exactly in file contents. Wrapping search patterns in `""` combines them as a single expression.
| Example | Explanation |
| :--- | :--- |
| `foo` | Match files containing the keyword `foo` |
| `foo bar` | Match files containing both `foo` **and** `bar` |
| `"foo bar"` | Match files containing the phrase `foo bar` |
| `"foo \"bar\""` | Match files containing `foo "bar"` exactly (escaped quotes) |
## Regex search
Toggle the regex button (`.*`) in the search bar to interpret search patterns as regular expressions.
Queries consist of space-separated regular expressions. Wrapping expressions in `""` combines them. By default, a file must have at least one match for each expression to be included.
| Example | Explanation |
| :--- | :--- |
| `foo` | Match files with regex `/foo/` |
| `foo.*bar` | Match files with regex `/foo.*bar/` (foo followed by any characters, then bar) |
| `^function\s+\w+` | Match files with regex `/^function\s+\w+/` (function at start of line, followed by whitespace and word characters) |
| `"foo bar"` | Match files with regex `/foo bar/`. Quotes are not matched. |
| `foo bar` | Match files with regex `/foo/` **and** `/bar/` |
| `"foo bar"` | Match files with regex `/foo bar/` |
## Search filters
Multiple expressions can be or'd together with `or`, negated with `-`, or grouped with `()`.
Search queries (keyword or regex) can include multiple search filters to further refine the search results. Some filters can be negated using the `-` prefix.
| Example | Explanation |
| :--- | :--- |
| `foo or bar` | Match files with regex `/foo/` **or** `/bar/` |
| `foo -bar` | Match files with regex `/foo/` but **not** `/bar/` |
| `foo (bar or baz)` | Match files with regex `/foo/` **and** either `/bar/` **or** `/baz/` |
Expressions can be prefixed with certain keywords to modify search behavior. Some keywords can be negated using the `-` prefix.
| Prefix | Description | Example |
| :--- | :--- | :--- |
| `file:` | Filter results from filepaths that match the regex. By default all files are searched. | `file:README` - Filter results to filepaths that match regex `/README/`<br/>`file:"my file"` - Filter results to filepaths that match regex `/my file/`<br/>`-file:test\.ts$` - Ignore results from filepaths that match regex `/test\.ts$/` |
| `repo:` | Filter results from repos that match the regex. By default all repos are searched. | `repo:linux` - Filter results to repos that match regex `/linux/`<br/>`-repo:^web/.*` - Ignore results from repos that match regex `/^web\/.*/` |
| `repo:` | Filter results from repos that match the regex. By default all repos are searched. | `repo:linux` - Filter results to repos that match regex `/linux/`<br/>`-repo:^web/.*` - Ignore results from repos that match regex `/^web\/.*/` |
| `rev:` | Filter results from a specific branch or tag. By default **only** the default branch is searched. | `rev:beta` - Filter results to branches that match regex `/beta/` |
| `lang:` | Filter results by language (as defined by [linguist](https://github.com/github-linguist/linguist/blob/main/lib/linguist/languages.yml)). By default all languages are searched. | `lang:TypeScript` - Filter results to TypeScript files<br/>`-lang:YAML` - Ignore results from YAML files |
| `sym:` | Match symbol definitions created by [universal ctags](https://ctags.io/) at index time. | `sym:\bmain\b` - Filter results to symbols that match regex `/\bmain\b/` |
| `context:` | Filter results to a predefined [search context](/docs/features/search/search-contexts). | `context:web` - Filter results to the web context<br/>`-context:pipelines` - Ignore results from the pipelines context |
## Boolean operators & grouping
By default, space-separated expressions are and'd together. The `or` keyword and parentheses `()` can be used to create more complex boolean logic. Parentheses can be negated using the `-` prefix.
| Example | Explanation |
| :--- | :--- |
| `foo or bar` | Match files containing `foo` **or** `bar` |
| `foo (bar or baz)` | Match files containing `foo` **and** either `bar` **or** `baz`. |
| `-(foo) bar` | Match files containing `bar` **and not** `foo`. |
| `context:` | Filter results to a predefined [search context](/docs/features/search/search-contexts). | `context:web` - Filter results to the web context<br/>`-context:pipelines` - Ignore results from the pipelines context |

View file

@ -7,7 +7,7 @@ sidebarTitle: License key
If you'd like a trial license, [reach out](https://www.sourcebot.dev/contact) and we'll send one over within 24 hours
</Note>
All core Sourcebot features are available under the [FSL license](https://github.com/sourcebot-dev/sourcebot/blob/main/LICENSE.md#functional-source-license-version-11-alv2-future-license). Some additional features require a license key. See the [pricing page](https://www.sourcebot.dev/pricing) for more details.
All core Sourcebot features are available under the [FSL license](https://github.com/sourcebot-dev/sourcebot/blob/main/LICENSE.md#functional-source-license-version-11-alv2-future-license) without any limits. Some additional features require a license key. See the [pricing page](https://www.sourcebot.dev/pricing) for more details.
## Activating a license key
@ -25,7 +25,7 @@ docker run \
## Feature availability
---
| Feature | [FSL](https://github.com/sourcebot-dev/sourcebot/blob/main/LICENSE.md#functional-source-license-version-11-alv2-future-license) | [Enterprise](https://github.com/sourcebot-dev/sourcebot/blob/main/ee/LICENSE) |
| Feature | OSS | Licensed |
|:---------|:-----|:----------|
| [Search](/docs/features/search/syntax-reference) | ✅ | ✅ |
| [Full code host support](/docs/connections/overview) | ✅ | ✅ |
@ -34,7 +34,6 @@ docker run \
| [Login with credentials](/docs/configuration/auth/overview) | ✅ | ✅ |
| [Login with email codes](/docs/configuration/auth/overview) | ✅ | ✅ |
| [Login with SSO](/docs/configuration/auth/overview#enterprise-authentication-providers) | 🛑 | ✅ |
| [Permission syncing](/docs/features/permission-syncing) | 🛑 | ✅ |
| [Code navigation](/docs/features/code-navigation) | 🛑 | ✅ |
| [Search contexts](/docs/features/search/search-contexts) | 🛑 | ✅ |
| [Audit logs](/docs/configuration/audit-logs) | 🛑 | ✅ |

View file

@ -2,14 +2,13 @@
title: "Overview"
---
[Sourcebot](https://github.com/sourcebot-dev/sourcebot) is a platform that helps humans and agents understand your codebase:
[Sourcebot](https://github.com/sourcebot-dev/sourcebot) is a self-hosted tool that helps you understand your codebase.
- [Code search](/docs/features/search/overview): Search and navigate across all your repos and branches, no matter where they're hosted
- [Ask Sourcebot](/docs/features/ask): Ask questions about your codebase and have Sourcebot provide detailed answers grounded with inline citations
- [MCP](/docs/features/mcp-server): Enrich agent context windows with code across your organization
<CardGroup>
<Card title="Deployment guides" icon="server" href="/docs/deployment/docker-compose" horizontal="true">
<Card title="Deployment guide" icon="server" href="/docs/deployment-guide" horizontal="true">
Learn how to self-host Sourcebot in a few simple steps.
</Card>
<Card title="Public demo" icon="globe" href="https://demo.sourcebot.dev/" horizontal="true">
@ -162,7 +161,7 @@ Sourcebot is designed to be easily self-hosted, allowing you to deploy it onto y
---
<CardGroup cols={2}>
<Card horizontal title="Deployment guides ->" href="/docs/deployment/docker-compose" />
<Card horizontal title="Deployment guide ->" href="/docs/deployment-guide" />
<Card horizontal title="Connecting your code ->" href="/docs/connections/overview" />
<Card horizontal title="Search syntax reference ->" href="/docs/features/search/syntax-reference" />
<Card horizontal title="Code navigation overview ->" href="/docs/features/code-navigation" />

View file

@ -78,7 +78,7 @@ If your deployment is dependent on these features, please [reach out](https://gi
After updating your configuration file, restart your Sourcebot deployment to pick up the new changes.
</Step>
<Step title="You're done!">
Congrats, you've successfully migrated to v3! Please let us know what you think of the new features by reaching out on our [discord](https://discord.gg/HDScTs3ptP) or on [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose).
Congrats, you've successfully migrated to v3! Please let us know what you think of the new features by reaching out on our [discord](https://discord.gg/6Fhp27x7Pb) or on [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose).
</Step>
</Steps>
@ -90,4 +90,4 @@ Some things to check:
- Make sure you have a name for each `connection`, and that the name only contains letters, digits, hyphens, or underscores
- Make sure each `connection` has a `type` field with a valid value (`gitlab`, `github`, `gitea`, `gerrit`)
Having trouble migrating from v2 to v3? Reach out to us on [discord](https://discord.gg/HDScTs3ptP) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose) and we'll try our best to help
Having trouble migrating from v2 to v3? Reach out to us on [discord](https://discord.gg/6Fhp27x7Pb) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose) and we'll try our best to help

View file

@ -40,7 +40,7 @@ Please note that the following features are no longer supported in v4:
</Step>
<Step title="You're done!">
Congrats, you've successfully migrated to v4! Please let us know what you think of the new features by reaching out on our [discord](https://discord.gg/HDScTs3ptP) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose)
Congrats, you've successfully migrated to v4! Please let us know what you think of the new features by reaching out on our [discord](https://discord.gg/6Fhp27x7Pb) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose)
</Step>
</Steps>
@ -58,4 +58,4 @@ to finish upgrading to v4 in single-tenant mode.
- If you're hitting issues with signing into your Sourcebot instance, make sure you're setting `AUTH_URL` correctly to your deployment domain (ex. `https://sourcebot.yourcompany.com`)
Having trouble migrating from v3 to v4? Reach out to us on [discord](https://discord.gg/HDScTs3ptP) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose) and we'll try our best to help
Having trouble migrating from v3 to v4? Reach out to us on [discord](https://discord.gg/6Fhp27x7Pb) or [GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new/choose) and we'll try our best to help

View file

@ -24,4 +24,27 @@
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your access token:
![](/images/secrets_list.png)
2. Add the `token` and `user` (username associated with the app password you created) properties to your connection config:
```json
{
"type": "bitbucket",
"deploymentType": "cloud",
"user": "myusername",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>

View file

@ -22,4 +22,25 @@
ghcr.io/sourcebot-dev/sourcebot:latest
```
</Tab>
<Tab title="Secret">
<Note>Secrets are only supported when [authentication](/docs/configuration/auth/overview) is enabled.</Note>
1. Navigate to **Secrets** in settings and create a new secret with your PAT:
![](/images/secrets_list.png)
2. Add the `token` property to your connection config:
```json
{
"type": "bitbucket",
"token": {
"secret": "mysecret"
}
// .. rest of config ..
}
```
</Tab>
</Tabs>

View file

@ -77,6 +77,7 @@
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -273,6 +274,7 @@
"token": {
"description": "An authentication token.",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -463,6 +465,7 @@
"token": {
"description": "An access token.",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -776,6 +779,7 @@
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -972,6 +976,7 @@
"token": {
"description": "An authentication token.",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -1162,6 +1167,7 @@
"token": {
"description": "An access token.",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -1557,6 +1563,7 @@
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -1753,6 +1760,7 @@
"token": {
"description": "An authentication token.",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}
@ -1943,6 +1951,7 @@
"token": {
"description": "An access token.",
"examples": [
"secret-token",
{
"env": "ENV_VAR_CONTAINING_TOKEN"
}

View file

@ -1,131 +0,0 @@
{/* THIS IS A AUTO-GENERATED FILE. DO NOT MODIFY MANUALLY! */}
```json
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "AppConfig",
"definitions": {
"GitHubAppConfig": {
"type": "object",
"properties": {
"type": {
"const": "github",
"description": "GitHub App Configuration"
},
"deploymentHostname": {
"type": "string",
"format": "hostname",
"default": "github.com",
"description": "The hostname of the GitHub App deployment.",
"examples": [
"github.com",
"github.example.com"
]
},
"id": {
"type": "string",
"description": "The ID of the GitHub App."
},
"privateKey": {
"description": "The private key of the GitHub App.",
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"type": "string",
"description": "The name of the environment variable that contains the token."
}
},
"required": [
"env"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
}
},
"required": [
"googleCloudSecret"
],
"additionalProperties": false
}
]
}
},
"required": [
"type",
"id",
"privateKey"
],
"additionalProperties": false
}
},
"oneOf": [
{
"type": "object",
"properties": {
"type": {
"const": "github",
"description": "GitHub App Configuration"
},
"deploymentHostname": {
"type": "string",
"format": "hostname",
"default": "github.com",
"description": "The hostname of the GitHub App deployment.",
"examples": [
"github.com",
"github.example.com"
]
},
"id": {
"type": "string",
"description": "The ID of the GitHub App."
},
"privateKey": {
"description": "The private key of the GitHub App.",
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"type": "string",
"description": "The name of the environment variable that contains the token."
}
},
"required": [
"env"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
}
},
"required": [
"googleCloudSecret"
],
"additionalProperties": false
}
]
}
},
"required": [
"type",
"id",
"privateKey"
],
"additionalProperties": false
}
]
}
```

View file

@ -11,30 +11,35 @@
},
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

View file

@ -15,30 +15,35 @@
},
"token": {
"description": "An authentication token.",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

View file

@ -15,30 +15,35 @@
},
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}
@ -223,30 +228,35 @@
},
"token": {
"description": "An authentication token.",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}
@ -425,30 +435,35 @@
},
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}
@ -692,30 +707,35 @@
},
"token": {
"description": "An authentication token.",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}
@ -860,30 +880,35 @@
},
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

View file

@ -1,115 +0,0 @@
{/* THIS IS A AUTO-GENERATED FILE. DO NOT MODIFY MANUALLY! */}
```json
{
"type": "object",
"description": "Environment variable overrides.",
"title": "EnvironmentOverrides",
"not": {
"$comment": "List of environment variables that are not allowed to be overridden.",
"anyOf": [
{
"required": [
"CONFIG_PATH"
]
}
]
},
"patternProperties": {
"^[a-zA-Z0-9_-]+$": {
"oneOf": [
{
"type": "object",
"properties": {
"type": {
"const": "token"
},
"value": {
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"type": "string",
"description": "The name of the environment variable that contains the token."
}
},
"required": [
"env"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
}
},
"required": [
"googleCloudSecret"
],
"additionalProperties": false
}
]
}
},
"required": [
"type",
"value"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"type": {
"const": "string"
},
"value": {
"type": "string"
}
},
"required": [
"type",
"value"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"type": {
"const": "number"
},
"value": {
"type": "number"
}
},
"required": [
"type",
"value"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"type": {
"const": "boolean"
},
"value": {
"type": "boolean"
}
},
"required": [
"type",
"value"
],
"additionalProperties": false
}
]
}
}
}
```

View file

@ -11,30 +11,35 @@
},
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

View file

@ -11,30 +11,35 @@
},
"token": {
"description": "A Personal Access Token (PAT).",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

View file

@ -11,30 +11,35 @@
},
"token": {
"description": "An authentication token.",
"examples": [
{
"secret": "SECRET_KEY"
}
],
"anyOf": [
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -9,26 +9,26 @@
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}
@ -89,26 +89,26 @@
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}
@ -133,26 +133,26 @@
{
"type": "object",
"properties": {
"env": {
"secret": {
"type": "string",
"description": "The name of the environment variable that contains the token."
"description": "The name of the secret that contains the token."
}
},
"required": [
"env"
"secret"
],
"additionalProperties": false
},
{
"type": "object",
"properties": {
"googleCloudSecret": {
"env": {
"type": "string",
"description": "The resource name of a Google Cloud secret. Must be in the format `projects/<project-id>/secrets/<secret-name>/versions/<version-id>`. See https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets"
"description": "The name of the environment variable that contains the token. Only supported in declarative connection configs."
}
},
"required": [
"googleCloudSecret"
"env"
],
"additionalProperties": false
}

View file

@ -1,71 +1,18 @@
#!/bin/sh
# Exit immediately if a command fails
set -e
# Disable auto-exporting of variables
set +a
# Detect if running as root
IS_ROOT=false
if [ "$(id -u)" -eq 0 ]; then
IS_ROOT=true
if [ "$DATABASE_URL" = "postgresql://postgres@localhost:5432/sourcebot" ]; then
DATABASE_EMBEDDED="true"
fi
if [ "$IS_ROOT" = "true" ]; then
echo -e "\e[34m[Info] Running as root user.\e[0m"
else
echo -e "\e[34m[Info] Running as non-root user.\e[0m"
fi
# If a CONFIG_PATH is set, resolve the environment overrides from the config file.
# The overrides will be written into variables scoped to the current shell. This is
# required in case one of the variables used in this entrypoint is overridden (e.g.,
# DATABASE_URL, REDIS_URL, etc.)
if [ -n "$CONFIG_PATH" ]; then
echo -e "\e[34m[Info] Resolving environment overrides from $CONFIG_PATH...\e[0m"
set +e # Disable exit on error so we can capture EXIT_CODE
OVERRIDES_OUTPUT=$(SKIP_ENV_VALIDATION=1 yarn tool:resolve-env-overrides 2>&1)
EXIT_CODE=$?
set -e # Re-enable exit on error
if [ $EXIT_CODE -eq 0 ]; then
eval "$OVERRIDES_OUTPUT"
else
echo -e "\e[31m[Error] Failed to resolve environment overrides.\e[0m"
echo "$OVERRIDES_OUTPUT"
exit 1
fi
fi
# Construct the database URL from the individual variables if DATABASE_URL is not set
if [ -z "$DATABASE_URL" ] && [ -n "$DATABASE_HOST" ] && [ -n "$DATABASE_USERNAME" ] && [ -n "$DATABASE_PASSWORD" ] && [ -n "$DATABASE_NAME" ]; then
DATABASE_URL="postgresql://${DATABASE_USERNAME}:${DATABASE_PASSWORD}@${DATABASE_HOST}/${DATABASE_NAME}"
if [ -n "$DATABASE_ARGS" ]; then
DATABASE_URL="${DATABASE_URL}?$DATABASE_ARGS"
fi
fi
if [ -z "$DATABASE_URL" ]; then
echo -e "\e[34m[Info] DATABASE_URL is not set. Using embeded database.\e[0m"
export DATABASE_EMBEDDED="true"
export DATABASE_URL="postgresql://postgres@localhost:5432/sourcebot"
else
export DATABASE_EMBEDDED="false"
fi
if [ -z "$REDIS_URL" ]; then
echo -e "\e[34m[Info] REDIS_URL is not set. Using embeded redis.\e[0m"
export REDIS_EMBEDDED="true"
export REDIS_URL="redis://localhost:6379"
else
export REDIS_EMBEDDED="false"
fi
echo -e "\e[34m[Info] Sourcebot version: $NEXT_PUBLIC_SOURCEBOT_VERSION\e[0m"
# If we don't have a PostHog key, then we need to disable telemetry.
if [ -z "$NEXT_PUBLIC_POSTHOG_PAPIK" ]; then
echo -e "\e[33m[Warning] NEXT_PUBLIC_POSTHOG_PAPIK was not set. Setting SOURCEBOT_TELEMETRY_DISABLED.\e[0m"
export SOURCEBOT_TELEMETRY_DISABLED=true
fi
if [ -n "$SOURCEBOT_TELEMETRY_DISABLED" ]; then
# Validate that SOURCEBOT_TELEMETRY_DISABLED is either "true" or "false"
if [ "$SOURCEBOT_TELEMETRY_DISABLED" != "true" ] && [ "$SOURCEBOT_TELEMETRY_DISABLED" != "false" ]; then
@ -89,17 +36,12 @@ fi
# Check if DATABASE_DATA_DIR exists, if not initialize it
if [ "$DATABASE_EMBEDDED" = "true" ] && [ ! -d "$DATABASE_DATA_DIR" ]; then
echo -e "\e[34m[Info] Initializing database at $DATABASE_DATA_DIR...\e[0m"
mkdir -p $DATABASE_DATA_DIR
if [ "$IS_ROOT" = "true" ]; then
chown -R postgres:postgres "$DATABASE_DATA_DIR"
su postgres -c "initdb -D $DATABASE_DATA_DIR"
else
initdb -D "$DATABASE_DATA_DIR" -U postgres
fi
mkdir -p $DATABASE_DATA_DIR && chown -R postgres:postgres "$DATABASE_DATA_DIR"
su postgres -c "initdb -D $DATABASE_DATA_DIR"
fi
# Create the redis data directory if it doesn't exist
if [ "$REDIS_EMBEDDED" = "true" ] && [ ! -d "$REDIS_DATA_DIR" ]; then
if [ ! -d "$REDIS_DATA_DIR" ]; then
mkdir -p $REDIS_DATA_DIR
fi
@ -153,7 +95,7 @@ if [ ! -f "$FIRST_RUN_FILE" ]; then
# (if telemetry is enabled)
if [ "$SOURCEBOT_TELEMETRY_DISABLED" = "false" ]; then
if ! ( curl -L --output /dev/null --silent --fail --header "Content-Type: application/json" -d '{
"api_key": "'"$POSTHOG_PAPIK"'",
"api_key": "'"$NEXT_PUBLIC_POSTHOG_PAPIK"'",
"event": "install",
"distinct_id": "'"$SOURCEBOT_INSTALL_ID"'",
"properties": {
@ -173,7 +115,7 @@ else
if [ "$SOURCEBOT_TELEMETRY_DISABLED" = "false" ]; then
if ! ( curl -L --output /dev/null --silent --fail --header "Content-Type: application/json" -d '{
"api_key": "'"$POSTHOG_PAPIK"'",
"api_key": "'"$NEXT_PUBLIC_POSTHOG_PAPIK"'",
"event": "upgrade",
"distinct_id": "'"$SOURCEBOT_INSTALL_ID"'",
"properties": {
@ -189,33 +131,16 @@ fi
echo "{\"version\": \"$NEXT_PUBLIC_SOURCEBOT_VERSION\", \"install_id\": \"$SOURCEBOT_INSTALL_ID\"}" > "$FIRST_RUN_FILE"
# Start the database and wait for it to be ready before starting any other service
if [ "$DATABASE_EMBEDDED" = "true" ]; then
if [ "$IS_ROOT" = "true" ]; then
su postgres -c "postgres -D $DATABASE_DATA_DIR" &
else
postgres -D "$DATABASE_DATA_DIR" &
fi
su postgres -c "postgres -D $DATABASE_DATA_DIR" &
until pg_isready -h localhost -p 5432 -U postgres; do
echo -e "\e[34m[Info] Waiting for the database to be ready...\e[0m"
sleep 1
# As postgres runs in the background, we must check if it is still
# running, otherwise the "until" loop will be running indefinitely.
if ! pgrep -x "postgres" > /dev/null; then
echo "postgres failed to run"
exit 1
fi
done
if [ "$IS_ROOT" = "false" ]; then
# Running as non-root we need to ensure the postgres account is created.
psql -U postgres -tc "SELECT 1 FROM pg_roles WHERE rolname='postgres'" | grep -q 1 \
|| createuser postgres -s
fi
# Check if the database already exists, and create it if it doesn't exist
# Check if the database already exists, and create it if it does not exist
EXISTING_DB=$(psql -U postgres -tAc "SELECT 1 FROM pg_database WHERE datname = 'sourcebot'")
if [ "$EXISTING_DB" = "1" ]; then
@ -228,9 +153,9 @@ fi
# Run a Database migration
echo -e "\e[34m[Info] Running database migration...\e[0m"
DATABASE_URL="$DATABASE_URL" yarn workspace @sourcebot/db prisma:migrate:prod
yarn workspace @sourcebot/db prisma:migrate:prod
# Create the log directory if it doesn't exist
# Create the log directory
mkdir -p /var/log/sourcebot
# Run supervisord

View file

@ -4,9 +4,9 @@
"packages/*"
],
"scripts": {
"build": "cross-env SKIP_ENV_VALIDATION=1 yarn workspaces foreach --all --topological run build",
"test": "yarn workspaces foreach --all --topological run test",
"dev": "concurrently --kill-others --names \"zoekt,worker,web,mcp,schemas\" 'yarn dev:zoekt' 'yarn dev:backend' 'yarn dev:web' 'yarn watch:mcp' 'yarn watch:schemas'",
"build": "cross-env SKIP_ENV_VALIDATION=1 yarn workspaces foreach -A run build",
"test": "yarn workspaces foreach -A run test",
"dev": "yarn dev:prisma:migrate:dev && npm-run-all --print-label --parallel dev:zoekt dev:backend dev:web watch:mcp watch:schemas",
"with-env": "cross-env PATH=\"$PWD/bin:$PATH\" dotenv -e .env.development -c --",
"dev:zoekt": "yarn with-env zoekt-webserver -index .sourcebot/index -rpc",
"dev:backend": "yarn with-env yarn workspace @sourcebot/backend dev:watch",
@ -18,16 +18,15 @@
"dev:prisma:studio": "yarn with-env yarn workspace @sourcebot/db prisma:studio",
"dev:prisma:migrate:reset": "yarn with-env yarn workspace @sourcebot/db prisma:migrate:reset",
"dev:prisma:db:push": "yarn with-env yarn workspace @sourcebot/db prisma:db:push",
"build:deps": "yarn workspaces foreach --recursive --topological --from '{@sourcebot/schemas,@sourcebot/db,@sourcebot/shared,@sourcebot/query-language}' run build"
"build:deps": "yarn workspaces foreach -R --from '{@sourcebot/schemas,@sourcebot/error,@sourcebot/crypto,@sourcebot/db,@sourcebot/shared}' run build"
},
"devDependencies": {
"concurrently": "^9.2.1",
"cross-env": "^7.0.3",
"dotenv-cli": "^8.0.0"
"dotenv-cli": "^8.0.0",
"npm-run-all": "^4.1.5"
},
"packageManager": "yarn@4.7.0",
"resolutions": {
"prettier": "3.5.3",
"@lezer/common": "1.3.0"
"prettier": "3.5.3"
}
}

View file

@ -24,35 +24,33 @@
"dependencies": {
"@coderabbitai/bitbucket": "^1.1.3",
"@gitbeaker/rest": "^40.5.1",
"@octokit/app": "^16.1.1",
"@octokit/rest": "^21.0.2",
"@sentry/cli": "^2.42.2",
"@sentry/node": "^9.3.0",
"@sentry/profiling-node": "^9.3.0",
"@sourcebot/crypto": "workspace:*",
"@sourcebot/db": "workspace:*",
"@sourcebot/error": "workspace:*",
"@sourcebot/logger": "workspace:*",
"@sourcebot/schemas": "workspace:*",
"@sourcebot/shared": "workspace:*",
"@t3-oss/env-core": "^0.12.0",
"@types/express": "^5.0.0",
"argparse": "^2.0.1",
"azure-devops-node-api": "^15.1.1",
"bullmq": "^5.34.10",
"chokidar": "^4.0.3",
"cross-fetch": "^4.0.0",
"dotenv": "^16.4.5",
"express": "^4.21.2",
"express-async-errors": "^3.1.1",
"fast-deep-equal": "^3.1.3",
"git-url-parse": "^16.1.0",
"gitea-js": "^1.22.0",
"glob": "^11.0.0",
"groupmq": "^1.0.0",
"ioredis": "^5.4.2",
"lowdb": "^7.0.1",
"micromatch": "^4.0.8",
"p-limit": "^7.2.0",
"posthog-node": "^4.2.1",
"prom-client": "^15.1.3",
"simple-git": "^3.27.0",
"zod": "^3.25.74"
"zod": "^3.24.3"
}
}

View file

@ -1,103 +0,0 @@
import { PrismaClient, RepoIndexingJobType } from '@sourcebot/db';
import { createLogger } from '@sourcebot/shared';
import express, { Request, Response } from 'express';
import 'express-async-errors';
import * as http from "http";
import z from 'zod';
import { ConnectionManager } from './connectionManager.js';
import { PromClient } from './promClient.js';
import { RepoIndexManager } from './repoIndexManager.js';
// Logger for this module, created with the label 'api'.
const logger = createLogger('api');
// Port passed to `app.listen` when the Api class starts its Express server.
const PORT = 3060;
/**
 * HTTP server exposing metrics and job-trigger endpoints.
 *
 * Routes registered by the constructor:
 *  - `/metrics`                   — serves the metrics of `promClient.registry`.
 *  - `POST /api/sync-connection`  — creates a sync job for one connection.
 *  - `POST /api/index-repo`       — creates an INDEX job for one repo.
 *
 * The server starts listening on `PORT` as soon as an instance is constructed.
 */
export class Api {
    private server: http.Server;

    /**
     * @param promClient        Source of the registry served at `/metrics`.
     * @param prisma            Database client used to look up connections and repos.
     * @param connectionManager Used to create connection sync jobs.
     * @param repoIndexManager  Used to create repo indexing jobs.
     */
    constructor(
        promClient: PromClient,
        private prisma: PrismaClient,
        private connectionManager: ConnectionManager,
        private repoIndexManager: RepoIndexManager,
    ) {
        const app = express();
        app.use(express.json());
        app.use(express.urlencoded({ extended: true }));

        // Prometheus metrics endpoint
        app.use('/metrics', async (_req: Request, res: Response) => {
            res.set('Content-Type', promClient.registry.contentType);

            const metrics = await promClient.registry.metrics();
            res.end(metrics);
        });

        // Handlers are bound so `this` still refers to the Api instance when
        // Express invokes them.
        app.post('/api/sync-connection', this.syncConnection.bind(this));
        app.post('/api/index-repo', this.indexRepo.bind(this));

        this.server = app.listen(PORT, () => {
            logger.info(`API server is running on port ${PORT}`);
        });
    }

    /**
     * Handles `POST /api/sync-connection`.
     *
     * Expects a JSON body of exactly `{ connectionId: <integer> }`.
     * Responds 400 when the body fails validation, 404 when no connection has
     * that id, and otherwise 200 with `{ jobId }` for the newly created job.
     */
    private async syncConnection(req: Request, res: Response) {
        const schema = z.object({
            // `.int()` rejects fractional ids here with a 400. A bare
            // `z.number()` would let e.g. 1.5 through to the database lookup;
            // ids are presumably integer keys, so that lookup would fail and
            // be reported as a server error instead of a client error.
            connectionId: z.number().int(),
        }).strict();

        const parsed = schema.safeParse(req.body);
        if (!parsed.success) {
            res.status(400).json({ error: parsed.error.message });
            return;
        }

        const { connectionId } = parsed.data;
        const connection = await this.prisma.connection.findUnique({
            where: {
                id: connectionId,
            }
        });

        if (!connection) {
            res.status(404).json({ error: 'Connection not found' });
            return;
        }

        // createJobs takes a batch; only one connection is submitted, so only
        // the first returned job id is relevant.
        const [jobId] = await this.connectionManager.createJobs([connection]);

        res.status(200).json({ jobId });
    }

    /**
     * Handles `POST /api/index-repo`.
     *
     * Expects a JSON body of exactly `{ repoId: <integer> }`.
     * Responds 400 when the body fails validation, 404 when no repo has that
     * id, and otherwise 200 with `{ jobId }` for the newly created INDEX job.
     */
    private async indexRepo(req: Request, res: Response) {
        const schema = z.object({
            // See syncConnection: fractional ids are a client error (400).
            repoId: z.number().int(),
        }).strict();

        const parsed = schema.safeParse(req.body);
        if (!parsed.success) {
            res.status(400).json({ error: parsed.error.message });
            return;
        }

        const { repoId } = parsed.data;
        const repo = await this.prisma.repo.findUnique({
            where: { id: repoId },
        });

        if (!repo) {
            res.status(404).json({ error: 'Repo not found' });
            return;
        }

        const [jobId] = await this.repoIndexManager.createJobs([repo], RepoIndexingJobType.INDEX);
        res.status(200).json({ jobId });
    }

    /**
     * Closes the HTTP server.
     *
     * @returns A promise that resolves once `server.close` reports completion
     *          and rejects with the error it reports otherwise.
     */
    public async dispose() {
        return new Promise<void>((resolve, reject) => {
            this.server.close((err) => {
                if (err) reject(err);
                else resolve(undefined);
            });
        });
    }
}

View file

@ -1,12 +1,13 @@
import { AzureDevOpsConnectionConfig } from "@sourcebot/schemas/v3/azuredevops.type";
import { createLogger } from "@sourcebot/shared";
import { measure, fetchWithRetry } from "./utils.js";
import { createLogger } from "@sourcebot/logger";
import { getTokenFromConfig, measure, fetchWithRetry } from "./utils.js";
import micromatch from "micromatch";
import { PrismaClient } from "@sourcebot/db";
import { BackendException, BackendError } from "@sourcebot/error";
import { processPromiseResults, throwIfAnyFailed } from "./connectionUtils.js";
import * as Sentry from "@sentry/node";
import * as azdev from "azure-devops-node-api";
import { GitRepository } from "azure-devops-node-api/interfaces/GitInterfaces.js";
import { getTokenFromConfig } from "@sourcebot/shared";
const logger = createLogger('azuredevops');
const AZUREDEVOPS_CLOUD_HOSTNAME = "dev.azure.com";
@ -27,54 +28,66 @@ function createAzureDevOpsConnection(
export const getAzureDevOpsReposFromConfig = async (
config: AzureDevOpsConnectionConfig,
orgId: number,
db: PrismaClient
) => {
const baseUrl = config.url || `https://${AZUREDEVOPS_CLOUD_HOSTNAME}`;
const token = config.token ?
await getTokenFromConfig(config.token) :
await getTokenFromConfig(config.token, orgId, db, logger) :
undefined;
if (!token) {
const e = new Error('Azure DevOps requires a Personal Access Token');
const e = new BackendException(BackendError.CONNECTION_SYNC_INVALID_TOKEN, {
message: 'Azure DevOps requires a Personal Access Token',
});
Sentry.captureException(e);
throw e;
}
const useTfsPath = config.useTfsPath || false;
let allRepos: GitRepository[] = [];
let allWarnings: string[] = [];
let notFound: {
users: string[],
orgs: string[],
repos: string[],
} = {
users: [],
orgs: [],
repos: [],
};
if (config.orgs) {
const { repos, warnings } = await getReposForOrganizations(
const { validRepos, notFoundOrgs } = await getReposForOrganizations(
config.orgs,
baseUrl,
token,
useTfsPath
);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
allRepos = allRepos.concat(validRepos);
notFound.orgs = notFoundOrgs;
}
if (config.projects) {
const { repos, warnings } = await getReposForProjects(
const { validRepos, notFoundProjects } = await getReposForProjects(
config.projects,
baseUrl,
token,
useTfsPath
);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
allRepos = allRepos.concat(validRepos);
notFound.repos = notFound.repos.concat(notFoundProjects);
}
if (config.repos) {
const { repos, warnings } = await getRepos(
const { validRepos, notFoundRepos } = await getRepos(
config.repos,
baseUrl,
token,
useTfsPath
);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
allRepos = allRepos.concat(validRepos);
notFound.repos = notFound.repos.concat(notFoundRepos);
}
let repos = allRepos
@ -90,8 +103,8 @@ export const getAzureDevOpsReposFromConfig = async (
logger.debug(`Found ${repos.length} total repositories.`);
return {
repos,
warnings: allWarnings,
validRepos: repos,
notFound,
};
};
@ -208,11 +221,10 @@ async function getReposForOrganizations(
// Check if it's a 404-like error (organization not found)
if (error && typeof error === 'object' && 'statusCode' in error && error.statusCode === 404) {
const warning = `Organization ${org} not found or no access`;
logger.warn(warning);
logger.error(`Organization ${org} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: org
};
}
throw error;
@ -220,11 +232,11 @@ async function getReposForOrganizations(
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<GitRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundOrgs } = processPromiseResults<GitRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundOrgs,
};
}
@ -262,11 +274,10 @@ async function getReposForProjects(
logger.error(`Failed to fetch repositories for project ${project}.`, error);
if (error && typeof error === 'object' && 'statusCode' in error && error.statusCode === 404) {
const warning = `Project ${project} not found or no access`;
logger.warn(warning);
logger.error(`Project ${project} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: project
};
}
throw error;
@ -274,11 +285,11 @@ async function getReposForProjects(
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<GitRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundProjects } = processPromiseResults<GitRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundProjects,
};
}
@ -317,11 +328,10 @@ async function getRepos(
logger.error(`Failed to fetch repository ${repo}.`, error);
if (error && typeof error === 'object' && 'statusCode' in error && error.statusCode === 404) {
const warning = `Repository ${repo} not found or no access`;
logger.warn(warning);
logger.error(`Repository ${repo} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
throw error;
@ -329,10 +339,10 @@ async function getRepos(
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<GitRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundRepos } = processPromiseResults<GitRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundRepos,
};
}

View file

@ -2,17 +2,16 @@ import { createBitbucketCloudClient } from "@coderabbitai/bitbucket/cloud";
import { createBitbucketServerClient } from "@coderabbitai/bitbucket/server";
import { BitbucketConnectionConfig } from "@sourcebot/schemas/v3/bitbucket.type";
import type { ClientOptions, ClientPathsWithMethod } from "openapi-fetch";
import { createLogger } from "@sourcebot/shared";
import { measure, fetchWithRetry } from "./utils.js";
import { createLogger } from "@sourcebot/logger";
import { PrismaClient } from "@sourcebot/db";
import { getTokenFromConfig, measure, fetchWithRetry } from "./utils.js";
import * as Sentry from "@sentry/node";
import micromatch from "micromatch";
import {
SchemaRepository as CloudRepository,
} from "@coderabbitai/bitbucket/cloud/openapi";
import { SchemaRestRepository as ServerRepository } from "@coderabbitai/bitbucket/server/openapi";
import { processPromiseResults } from "./connectionUtils.js";
import { throwIfAnyFailed } from "./connectionUtils.js";
import { getTokenFromConfig } from "@sourcebot/shared";
const logger = createLogger('bitbucket');
const BITBUCKET_CLOUD_GIT = 'https://bitbucket.org';
@ -28,9 +27,9 @@ interface BitbucketClient {
apiClient: any;
baseUrl: string;
gitUrl: string;
getReposForWorkspace: (client: BitbucketClient, workspaces: string[]) => Promise<{repos: BitbucketRepository[], warnings: string[]}>;
getReposForProjects: (client: BitbucketClient, projects: string[]) => Promise<{repos: BitbucketRepository[], warnings: string[]}>;
getRepos: (client: BitbucketClient, repos: string[]) => Promise<{repos: BitbucketRepository[], warnings: string[]}>;
getReposForWorkspace: (client: BitbucketClient, workspaces: string[]) => Promise<{validRepos: BitbucketRepository[], notFoundWorkspaces: string[]}>;
getReposForProjects: (client: BitbucketClient, projects: string[]) => Promise<{validRepos: BitbucketRepository[], notFoundProjects: string[]}>;
getRepos: (client: BitbucketClient, repos: string[]) => Promise<{validRepos: BitbucketRepository[], notFoundRepos: string[]}>;
shouldExcludeRepo: (repo: BitbucketRepository, config: BitbucketConnectionConfig) => boolean;
}
@ -58,9 +57,9 @@ type ServerPaginatedResponse<T> = {
readonly nextPageStart: number;
}
export const getBitbucketReposFromConfig = async (config: BitbucketConnectionConfig) => {
export const getBitbucketReposFromConfig = async (config: BitbucketConnectionConfig, orgId: number, db: PrismaClient) => {
const token = config.token ?
await getTokenFromConfig(config.token) :
await getTokenFromConfig(config.token, orgId, db, logger) :
undefined;
if (config.deploymentType === 'server' && !config.url) {
@ -72,24 +71,32 @@ export const getBitbucketReposFromConfig = async (config: BitbucketConnectionCon
cloudClient(config.user, token);
let allRepos: BitbucketRepository[] = [];
let allWarnings: string[] = [];
let notFound: {
orgs: string[],
users: string[],
repos: string[],
} = {
orgs: [],
users: [],
repos: [],
};
if (config.workspaces) {
const { repos, warnings } = await client.getReposForWorkspace(client, config.workspaces);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundWorkspaces } = await client.getReposForWorkspace(client, config.workspaces);
allRepos = allRepos.concat(validRepos);
notFound.orgs = notFoundWorkspaces;
}
if (config.projects) {
const { repos, warnings } = await client.getReposForProjects(client, config.projects);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundProjects } = await client.getReposForProjects(client, config.projects);
allRepos = allRepos.concat(validRepos);
notFound.orgs = notFoundProjects;
}
if (config.repos) {
const { repos, warnings } = await client.getRepos(client, config.repos);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundRepos } = await client.getRepos(client, config.repos);
allRepos = allRepos.concat(validRepos);
notFound.repos = notFoundRepos;
}
const filteredRepos = allRepos.filter((repo) => {
@ -97,8 +104,8 @@ export const getBitbucketReposFromConfig = async (config: BitbucketConnectionCon
});
return {
repos: filteredRepos,
warnings: allWarnings,
validRepos: filteredRepos,
notFound,
};
}
@ -179,7 +186,7 @@ function parseUrl(url: string): { path: string; query: Record<string, string>; }
}
async function cloudGetReposForWorkspace(client: BitbucketClient, workspaces: string[]): Promise<{repos: CloudRepository[], warnings: string[]}> {
async function cloudGetReposForWorkspace(client: BitbucketClient, workspaces: string[]): Promise<{validRepos: CloudRepository[], notFoundWorkspaces: string[]}> {
const results = await Promise.allSettled(workspaces.map(async (workspace) => {
try {
logger.debug(`Fetching all repos for workspace ${workspace}...`);
@ -214,11 +221,10 @@ async function cloudGetReposForWorkspace(client: BitbucketClient, workspaces: st
const status = e?.cause?.response?.status;
if (status == 404) {
const warning = `Workspace ${workspace} not found or invalid access`;
logger.warn(warning);
logger.error(`Workspace ${workspace} not found or invalid access`)
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: workspace
}
}
throw e;
@ -226,22 +232,21 @@ async function cloudGetReposForWorkspace(client: BitbucketClient, workspaces: st
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundWorkspaces } = processPromiseResults(results);
return {
repos,
warnings,
validRepos,
notFoundWorkspaces,
};
}
async function cloudGetReposForProjects(client: BitbucketClient, projects: string[]): Promise<{repos: CloudRepository[], warnings: string[]}> {
async function cloudGetReposForProjects(client: BitbucketClient, projects: string[]): Promise<{validRepos: CloudRepository[], notFoundProjects: string[]}> {
const results = await Promise.allSettled(projects.map(async (project) => {
const [workspace, project_name] = project.split('/');
if (!workspace || !project_name) {
const warning = `Invalid project ${project}`;
logger.warn(warning);
logger.error(`Invalid project ${project}`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: project
}
}
@ -277,11 +282,10 @@ async function cloudGetReposForProjects(client: BitbucketClient, projects: strin
const status = e?.cause?.response?.status;
if (status == 404) {
const warning = `Project ${project_name} not found in ${workspace} or invalid access`;
logger.warn(warning);
logger.error(`Project ${project_name} not found in ${workspace} or invalid access`)
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: project
}
}
throw e;
@ -289,22 +293,21 @@ async function cloudGetReposForProjects(client: BitbucketClient, projects: strin
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundProjects } = processPromiseResults(results);
return {
repos,
warnings
validRepos,
notFoundProjects
}
}
async function cloudGetRepos(client: BitbucketClient, repoList: string[]): Promise<{repos: CloudRepository[], warnings: string[]}> {
const results = await Promise.allSettled(repoList.map(async (repo) => {
async function cloudGetRepos(client: BitbucketClient, repos: string[]): Promise<{validRepos: CloudRepository[], notFoundRepos: string[]}> {
const results = await Promise.allSettled(repos.map(async (repo) => {
const [workspace, repo_slug] = repo.split('/');
if (!workspace || !repo_slug) {
const warning = `Invalid repo ${repo}`;
logger.warn(warning);
logger.error(`Invalid repo ${repo}`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
@ -326,11 +329,10 @@ async function cloudGetRepos(client: BitbucketClient, repoList: string[]): Promi
const status = e?.cause?.response?.status;
if (status === 404) {
const warning = `Repo ${repo} not found in ${workspace} or invalid access`;
logger.warn(warning);
logger.error(`Repo ${repo} not found in ${workspace} or invalid access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
throw e;
@ -338,24 +340,19 @@ async function cloudGetRepos(client: BitbucketClient, repoList: string[]): Promi
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundRepos } = processPromiseResults(results);
return {
repos,
warnings
validRepos,
notFoundRepos
};
}
function cloudShouldExcludeRepo(repo: BitbucketRepository, config: BitbucketConnectionConfig): boolean {
const cloudRepo = repo as CloudRepository;
let reason = '';
const repoName = cloudRepo.full_name!;
const shouldExclude = (() => {
if (config.exclude?.repos) {
if (micromatch.isMatch(repoName, config.exclude.repos)) {
reason = `\`exclude.repos\` contains ${repoName}`;
return true;
}
if (config.exclude?.repos && config.exclude.repos.includes(cloudRepo.full_name!)) {
return true;
}
if (!!config.exclude?.archived) {
@ -363,15 +360,12 @@ function cloudShouldExcludeRepo(repo: BitbucketRepository, config: BitbucketConn
}
if (!!config.exclude?.forks && cloudRepo.parent !== undefined) {
reason = `\`exclude.forks\` is true`;
return true;
}
return false;
})();
if (shouldExclude) {
logger.debug(`Excluding repo ${repoName}. Reason: ${reason}`);
logger.debug(`Excluding repo ${cloudRepo.full_name} because it matches the exclude pattern`);
return true;
}
return false;
@ -440,16 +434,15 @@ const getPaginatedServer = async <T>(
return results;
}
async function serverGetReposForWorkspace(client: BitbucketClient, workspaces: string[]): Promise<{repos: ServerRepository[], warnings: string[]}> {
const warnings = workspaces.map(workspace => `Workspaces are not supported in Bitbucket Server: ${workspace}`);
async function serverGetReposForWorkspace(client: BitbucketClient, workspaces: string[]): Promise<{validRepos: ServerRepository[], notFoundWorkspaces: string[]}> {
logger.debug('Workspaces are not supported in Bitbucket Server');
return {
repos: [],
warnings
validRepos: [],
notFoundWorkspaces: workspaces
};
}
async function serverGetReposForProjects(client: BitbucketClient, projects: string[]): Promise<{repos: ServerRepository[], warnings: string[]}> {
async function serverGetReposForProjects(client: BitbucketClient, projects: string[]): Promise<{validRepos: ServerRepository[], notFoundProjects: string[]}> {
const results = await Promise.allSettled(projects.map(async (project) => {
try {
logger.debug(`Fetching all repos for project ${project}...`);
@ -484,11 +477,10 @@ async function serverGetReposForProjects(client: BitbucketClient, projects: stri
const status = e?.cause?.response?.status;
if (status == 404) {
const warning = `Project ${project} not found or invalid access`;
logger.warn(warning);
logger.error(`Project ${project} not found or invalid access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: project
};
}
throw e;
@ -496,22 +488,21 @@ async function serverGetReposForProjects(client: BitbucketClient, projects: stri
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundProjects } = processPromiseResults(results);
return {
repos,
warnings
validRepos,
notFoundProjects
};
}
async function serverGetRepos(client: BitbucketClient, repoList: string[]): Promise<{repos: ServerRepository[], warnings: string[]}> {
const results = await Promise.allSettled(repoList.map(async (repo) => {
async function serverGetRepos(client: BitbucketClient, repos: string[]): Promise<{validRepos: ServerRepository[], notFoundRepos: string[]}> {
const results = await Promise.allSettled(repos.map(async (repo) => {
const [project, repo_slug] = repo.split('/');
if (!project || !repo_slug) {
const warning = `Invalid repo ${repo}`;
logger.warn(warning);
logger.error(`Invalid repo ${repo}`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
@ -533,11 +524,10 @@ async function serverGetRepos(client: BitbucketClient, repoList: string[]): Prom
const status = e?.cause?.response?.status;
if (status === 404) {
const warning = `Repo ${repo} not found in project ${project} or invalid access`;
logger.warn(warning);
logger.error(`Repo ${repo} not found in project ${project} or invalid access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
throw e;
@ -545,10 +535,10 @@ async function serverGetRepos(client: BitbucketClient, repoList: string[]): Prom
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundRepos } = processPromiseResults(results);
return {
repos,
warnings
validRepos,
notFoundRepos
};
}
@ -557,32 +547,23 @@ function serverShouldExcludeRepo(repo: BitbucketRepository, config: BitbucketCon
const projectName = serverRepo.project!.key;
const repoSlug = serverRepo.slug!;
const repoName = `${projectName}/${repoSlug}`;
let reason = '';
const shouldExclude = (() => {
if (config.exclude?.repos) {
if (micromatch.isMatch(repoName, config.exclude.repos)) {
reason = `\`exclude.repos\` contains ${repoName}`;
return true;
}
if (config.exclude?.repos && config.exclude.repos.includes(`${projectName}/${repoSlug}`)) {
return true;
}
if (!!config.exclude?.archived && serverRepo.archived) {
reason = `\`exclude.archived\` is true`;
return true;
}
if (!!config.exclude?.forks && serverRepo.origin !== undefined) {
reason = `\`exclude.forks\` is true`;
return true;
}
return false;
})();
if (shouldExclude) {
logger.debug(`Excluding repo ${repoName}. Reason: ${reason}`);
logger.debug(`Excluding repo ${projectName}/${repoSlug} because it matches the exclude pattern`);
return true;
}
return false;

View file

@ -1,127 +0,0 @@
import { Prisma, PrismaClient } from "@sourcebot/db";
import { createLogger } from "@sourcebot/shared";
import { ConnectionConfig } from "@sourcebot/schemas/v3/connection.type";
import { loadConfig } from "@sourcebot/shared";
import chokidar, { FSWatcher } from 'chokidar';
import { ConnectionManager } from "./connectionManager.js";
import { SINGLE_TENANT_ORG_ID } from "./constants.js";
import { syncSearchContexts } from "./ee/syncSearchContexts.js";
import isEqual from 'fast-deep-equal';
// Logger for this module, created with the label 'config-manager'.
const logger = createLogger('config-manager');
/**
 * Mirrors the config file at `configPath` into the database.
 *
 * One sync is started when the instance is constructed, and another each time
 * the file watcher reports a `change` event for the file. A sync upserts every
 * connection declared in the file, removes declarative connections that are no
 * longer declared, and then syncs search contexts.
 */
export class ConfigManager {
    private watcher: FSWatcher;

    /**
     * @param db                Database client used for all connection reads and writes.
     * @param connectionManager Used to create sync jobs for new or changed connections.
     * @param configPath        Path of the config file to load and watch.
     */
    constructor(
        private db: PrismaClient,
        private connectionManager: ConnectionManager,
        configPath: string,
    ) {
        const watchOptions = {
            // No events for files that already exist when watching starts.
            ignoreInitial: true,
            // Wait until the file size has been stable for 100ms, checking every 100ms.
            awaitWriteFinish: {
                stabilityThreshold: 100,
                pollInterval: 100,
            },
            // Cope with editors that write a temp file and rename it into place.
            atomic: true,
        };
        this.watcher = chokidar.watch(configPath, watchOptions);

        const onConfigChanged = async () => {
            logger.info(`Config file ${configPath} changed. Syncing config.`);
            // A failed re-sync is logged and otherwise ignored.
            await this.syncConfig(configPath).catch((error) => {
                logger.error(`Failed to sync config: ${error}`);
            });
        };
        this.watcher.on('change', onConfigChanged);

        // Initial sync; started here without being awaited.
        this.syncConfig(configPath);
    }

    /**
     * Loads the config file, then syncs its connections followed by its search contexts.
     */
    private syncConfig = async (configPath: string) => {
        const { connections, contexts } = await loadConfig(configPath);

        await this.syncConnections(connections);

        await syncSearchContexts({
            contexts,
            orgId: SINGLE_TENANT_ORG_ID,
            db: this.db,
        });
    }

    /**
     * Reconciles the declared connections with the database rows of the single-tenant org.
     *
     * Each declared connection is created, or updated in place if a row with
     * the same name already exists; a sync job is created whenever the row is
     * new or its stored config differs from the declared one. Afterwards every
     * declarative connection whose name is not declared is deleted.
     */
    private syncConnections = async (connections?: { [key: string]: ConnectionConfig }) => {
        const declared = Object.entries(connections ?? {});

        for (const [name, desiredConfig] of declared) {
            const current = await this.db.connection.findUnique({
                where: {
                    name_orgId: { name, orgId: SINGLE_TENANT_ORG_ID },
                },
            });

            // Compare against the stored config before it is overwritten below.
            const storedConfig = current ? (current.config as unknown as ConnectionConfig) : undefined;
            const isUpToDate = !!storedConfig && isEqual(storedConfig, desiredConfig);

            const configJson = desiredConfig as unknown as Prisma.InputJsonValue;

            // Create the row when it is missing, otherwise update it.
            const connection = !current
                ? await this.db.connection.create({
                    data: {
                        name,
                        config: configJson,
                        connectionType: desiredConfig.type,
                        isDeclarative: true,
                        org: {
                            connect: { id: SINGLE_TENANT_ORG_ID },
                        },
                    },
                })
                : await this.db.connection.update({
                    where: { id: current.id },
                    data: {
                        config: configJson,
                        isDeclarative: true,
                    },
                });

            if (isUpToDate) {
                continue;
            }

            logger.info(`Change detected for connection '${name}' (id: ${connection.id}). Creating sync job.`);
            await this.connectionManager.createJobs([connection]);
        }

        // Remove declarative connections that the config no longer declares.
        const stale = await this.db.connection.findMany({
            where: {
                isDeclarative: true,
                name: {
                    notIn: declared.map(([name]) => name),
                },
                orgId: SINGLE_TENANT_ORG_ID,
            },
        });

        for (const { id, name } of stale) {
            logger.info(`Deleting connection with name '${name}'. Connection ID: ${id}`);
            await this.db.connection.delete({
                where: { id },
            });
        }
    }

    /**
     * Stops watching the config file.
     */
    public dispose = async () => {
        await this.watcher.close();
    }
}

View file

@ -1,241 +1,212 @@
import * as Sentry from "@sentry/node";
import { Connection, ConnectionSyncJobStatus, PrismaClient } from "@sourcebot/db";
import { createLogger } from "@sourcebot/shared";
import { ConnectionConfig } from "@sourcebot/schemas/v3/connection.type";
import { loadConfig, env } from "@sourcebot/shared";
import { Job, Queue, ReservedJob, Worker } from "groupmq";
import { Redis } from 'ioredis';
import { compileAzureDevOpsConfig, compileBitbucketConfig, compileGenericGitHostConfig, compileGerritConfig, compileGiteaConfig, compileGithubConfig, compileGitlabConfig } from "./repoCompileUtils.js";
import { Connection, ConnectionSyncStatus, PrismaClient, Prisma } from "@sourcebot/db";
import { Job, Queue, Worker } from 'bullmq';
import { Settings } from "./types.js";
import { groupmqLifecycleExceptionWrapper, setIntervalAsync } from "./utils.js";
import { syncSearchContexts } from "./ee/syncSearchContexts.js";
import { ConnectionConfig } from "@sourcebot/schemas/v3/connection.type";
import { createLogger } from "@sourcebot/logger";
import { Redis } from 'ioredis';
import { RepoData, compileGithubConfig, compileGitlabConfig, compileGiteaConfig, compileGerritConfig, compileBitbucketConfig, compileAzureDevOpsConfig, compileGenericGitHostConfig } from "./repoCompileUtils.js";
import { BackendError, BackendException } from "@sourcebot/error";
import { captureEvent } from "./posthog.js";
import { PromClient } from "./promClient.js";
import { GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS } from "./constants.js";
import { env } from "./env.js";
import * as Sentry from "@sentry/node";
import { loadConfig, syncSearchContexts } from "@sourcebot/shared";
const LOG_TAG = 'connection-manager';
const logger = createLogger(LOG_TAG);
const createJobLogger = (jobId: string) => createLogger(`${LOG_TAG}:job:${jobId}`);
const QUEUE_NAME = 'connection-sync-queue';
const QUEUE_NAME = 'connectionSyncQueue';
type JobPayload = {
jobId: string,
connectionId: number,
connectionName: string,
orgId: number,
config: ConnectionConfig,
};
type JobResult = {
repoCount: number,
}
const JOB_TIMEOUT_MS = 1000 * 60 * 60 * 2; // 2 hour timeout
export class ConnectionManager {
private worker: Worker<JobPayload>;
private worker: Worker;
private queue: Queue<JobPayload>;
private logger = createLogger('connection-manager');
private interval?: NodeJS.Timeout;
constructor(
private db: PrismaClient,
private settings: Settings,
private redis: Redis,
private promClient: PromClient,
redis: Redis,
) {
this.queue = new Queue<JobPayload>({
redis,
namespace: QUEUE_NAME,
jobTimeoutMs: JOB_TIMEOUT_MS,
maxAttempts: 3,
logger: env.DEBUG_ENABLE_GROUPMQ_LOGGING === 'true',
this.queue = new Queue<JobPayload>(QUEUE_NAME, {
connection: redis,
});
this.worker = new Worker<JobPayload>({
queue: this.queue,
maxStalledCount: 1,
handler: this.runJob.bind(this),
this.worker = new Worker(QUEUE_NAME, this.runSyncJob.bind(this), {
connection: redis,
concurrency: this.settings.maxConnectionSyncJobConcurrency,
...(env.DEBUG_ENABLE_GROUPMQ_LOGGING === 'true' ? {
logger: true,
} : {}),
});
this.worker.on('completed', this.onSyncJobCompleted.bind(this));
this.worker.on('failed', this.onSyncJobFailed.bind(this));
}
this.worker.on('completed', this.onJobCompleted.bind(this));
this.worker.on('failed', this.onJobFailed.bind(this));
this.worker.on('stalled', this.onJobStalled.bind(this));
this.worker.on('error', this.onWorkerError.bind(this));
// graceful-timeout is triggered when a job is still processing after
// worker.close() is called and the timeout period has elapsed. In this case,
// we fail the job with no retry.
this.worker.on('graceful-timeout', this.onJobGracefulTimeout.bind(this));
public async scheduleConnectionSync(connection: Connection) {
await this.db.$transaction(async (tx) => {
await tx.connection.update({
where: { id: connection.id },
data: { syncStatus: ConnectionSyncStatus.IN_SYNC_QUEUE },
});
const connectionConfig = connection.config as unknown as ConnectionConfig;
await this.queue.add('connectionSyncJob', {
connectionId: connection.id,
connectionName: connection.name,
orgId: connection.orgId,
config: connectionConfig,
}, {
removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
removeOnFail: env.REDIS_REMOVE_ON_FAIL,
});
this.logger.info(`Added job to queue for connection ${connection.name} (id: ${connection.id})`);
}).catch((err: unknown) => {
this.logger.error(`Failed to add job to queue for connection ${connection.name} (id: ${connection.id}): ${err}`);
});
}
public startScheduler() {
logger.debug('Starting scheduler');
this.interval = setIntervalAsync(async () => {
this.logger.debug('Starting scheduler');
this.interval = setInterval(async () => {
const thresholdDate = new Date(Date.now() - this.settings.resyncConnectionIntervalMs);
const timeoutDate = new Date(Date.now() - JOB_TIMEOUT_MS);
const connections = await this.db.connection.findMany({
where: {
AND: [
OR: [
// When the connection needs to be synced, we want to sync it immediately.
{
OR: [
{ syncedAt: null },
{ syncedAt: { lt: thresholdDate } },
]
syncStatus: ConnectionSyncStatus.SYNC_NEEDED,
},
// When the connection has already been synced, we only want to re-sync if the re-sync interval has elapsed
// (or if the date isn't set for some reason).
{
NOT: {
syncJobs: {
some: {
OR: [
// Don't schedule if there are active jobs that were created within the threshold date.
// This handles the case where a job is stuck in a pending state and will never be scheduled.
{
AND: [
{ status: { in: [ConnectionSyncJobStatus.PENDING, ConnectionSyncJobStatus.IN_PROGRESS] } },
{ createdAt: { gt: timeoutDate } },
]
},
// Don't schedule if there are recent failed jobs (within the threshold date).
{
AND: [
{ status: ConnectionSyncJobStatus.FAILED },
{ completedAt: { gt: thresholdDate } },
]
}
]
}
AND: [
{
OR: [
{ syncStatus: ConnectionSyncStatus.SYNCED },
{ syncStatus: ConnectionSyncStatus.SYNCED_WITH_WARNINGS },
]
},
{
OR: [
{ syncedAt: null },
{ syncedAt: { lt: thresholdDate } },
]
}
}
]
}
]
}
});
if (connections.length > 0) {
await this.createJobs(connections);
for (const connection of connections) {
await this.scheduleConnectionSync(connection);
}
}, this.settings.resyncConnectionPollingIntervalMs);
this.worker.run();
}
public async createJobs(connections: Connection[]) {
const jobs = await this.db.connectionSyncJob.createManyAndReturn({
data: connections.map(connection => ({
connectionId: connection.id,
})),
include: {
connection: true,
}
});
for (const job of jobs) {
logger.info(`Scheduling job ${job.id} for connection ${job.connection.name} (id: ${job.connectionId})`);
await this.queue.add({
groupId: `connection:${job.connectionId}`,
data: {
jobId: job.id,
connectionId: job.connectionId,
connectionName: job.connection.name,
orgId: job.connection.orgId,
},
jobId: job.id,
});
this.promClient.pendingConnectionSyncJobs.inc({ connection: job.connection.name });
}
return jobs.map(job => job.id);
}
private async runJob(job: ReservedJob<JobPayload>): Promise<JobResult> {
const { jobId, connectionName } = job.data;
const logger = createJobLogger(jobId);
logger.info(`Running connection sync job ${jobId} for connection ${connectionName} (id: ${job.data.connectionId}) (attempt ${job.attempts + 1} / ${job.maxAttempts})`);
const currentStatus = await this.db.connectionSyncJob.findUniqueOrThrow({
where: {
id: jobId,
},
select: {
status: true,
}
});
// Fail safe: if the job is not PENDING (first run) or IN_PROGRESS (retry), it indicates the job
// is in an invalid state and should be skipped.
if (currentStatus.status !== ConnectionSyncJobStatus.PENDING && currentStatus.status !== ConnectionSyncJobStatus.IN_PROGRESS) {
throw new Error(`Job ${jobId} is not in a valid state. Expected: ${ConnectionSyncJobStatus.PENDING} or ${ConnectionSyncJobStatus.IN_PROGRESS}. Actual: ${currentStatus.status}. Skipping.`);
}
this.promClient.pendingConnectionSyncJobs.dec({ connection: connectionName });
this.promClient.activeConnectionSyncJobs.inc({ connection: connectionName });
private async runSyncJob(job: Job<JobPayload>): Promise<JobResult> {
const { config, orgId, connectionName } = job.data;
// @note: We aren't actually doing anything with this atm.
const abortController = new AbortController();
const { connection: { config: rawConnectionConfig, orgId } } = await this.db.connectionSyncJob.update({
const connection = await this.db.connection.findUnique({
where: {
id: jobId,
id: job.data.connectionId,
},
});
if (!connection) {
const e = new BackendException(BackendError.CONNECTION_SYNC_CONNECTION_NOT_FOUND, {
message: `Connection ${job.data.connectionId} not found`,
});
Sentry.captureException(e);
throw e;
}
// Reset the syncStatusMetadata to an empty object at the start of the sync job
await this.db.connection.update({
where: {
id: job.data.connectionId,
},
data: {
status: ConnectionSyncJobStatus.IN_PROGRESS,
},
select: {
connection: {
select: {
config: true,
orgId: true,
syncStatus: ConnectionSyncStatus.SYNCING,
syncStatusMetadata: {}
}
})
let result: {
repoData: RepoData[],
notFound: {
users: string[],
orgs: string[],
repos: string[],
}
} = {
repoData: [],
notFound: {
users: [],
orgs: [],
repos: [],
}
};
try {
result = await (async () => {
switch (config.type) {
case 'github': {
return await compileGithubConfig(config, job.data.connectionId, orgId, this.db, abortController);
}
case 'gitlab': {
return await compileGitlabConfig(config, job.data.connectionId, orgId, this.db);
}
case 'gitea': {
return await compileGiteaConfig(config, job.data.connectionId, orgId, this.db);
}
case 'gerrit': {
return await compileGerritConfig(config, job.data.connectionId, orgId);
}
case 'bitbucket': {
return await compileBitbucketConfig(config, job.data.connectionId, orgId, this.db);
}
case 'azuredevops': {
return await compileAzureDevOpsConfig(config, job.data.connectionId, orgId, this.db, abortController);
}
case 'git': {
return await compileGenericGitHostConfig(config, job.data.connectionId, orgId);
}
}
},
});
})();
} catch (err) {
this.logger.error(`Failed to compile repo data for connection ${job.data.connectionId} (${connectionName}): ${err}`);
Sentry.captureException(err);
const config = rawConnectionConfig as unknown as ConnectionConfig;
const result = await (async () => {
switch (config.type) {
case 'github': {
return await compileGithubConfig(config, job.data.connectionId, abortController.signal);
}
case 'gitlab': {
return await compileGitlabConfig(config, job.data.connectionId);
}
case 'gitea': {
return await compileGiteaConfig(config, job.data.connectionId);
}
case 'gerrit': {
return await compileGerritConfig(config, job.data.connectionId);
}
case 'bitbucket': {
return await compileBitbucketConfig(config, job.data.connectionId);
}
case 'azuredevops': {
return await compileAzureDevOpsConfig(config, job.data.connectionId);
}
case 'git': {
return await compileGenericGitHostConfig(config, job.data.connectionId);
}
if (err instanceof BackendException) {
throw err;
} else {
throw new BackendException(BackendError.CONNECTION_SYNC_SYSTEM_ERROR, {
message: `Failed to compile repo data for connection ${job.data.connectionId}`,
});
}
})();
}
let { repoData, warnings } = result;
let { repoData, notFound } = result;
await this.db.connectionSyncJob.update({
// Push the information regarding not found users, orgs, and repos to the connection's syncStatusMetadata. Note that
// this won't be overwritten even if the connection job fails
await this.db.connection.update({
where: {
id: jobId,
id: job.data.connectionId,
},
data: {
warningMessages: warnings,
},
syncStatusMetadata: { notFound }
}
});
// Filter out any duplicates by external_id and external_codeHostUrl.
repoData = repoData.filter((repo, index, self) => {
return index === self.findIndex(r =>
@ -262,7 +233,7 @@ export class ConnectionManager {
}
});
const deleteDuration = performance.now() - deleteStart;
logger.info(`Deleted all RepoToConnection records for connection ${connectionName} (id: ${job.data.connectionId}) in ${deleteDuration}ms`);
this.logger.info(`Deleted all RepoToConnection records for connection ${connectionName} (id: ${job.data.connectionId}) in ${deleteDuration}ms`);
const totalUpsertStart = performance.now();
for (const repo of repoData) {
@ -279,10 +250,10 @@ export class ConnectionManager {
create: repo,
})
const upsertDuration = performance.now() - upsertStart;
logger.debug(`Upserted repo ${repo.displayName} (id: ${repo.external_id}) in ${upsertDuration}ms`);
this.logger.info(`Upserted repo ${repo.displayName} (id: ${repo.external_id}) in ${upsertDuration}ms`);
}
const totalUpsertDuration = performance.now() - totalUpsertStart;
logger.info(`Upserted ${repoData.length} repos for connection ${connectionName} (id: ${job.data.connectionId}) in ${totalUpsertDuration}ms`);
this.logger.info(`Upserted ${repoData.length} repos for connection ${connectionName} (id: ${job.data.connectionId}) in ${totalUpsertDuration}ms`);
}, { timeout: env.CONNECTION_MANAGER_UPSERT_TIMEOUT_MS });
return {
@ -291,179 +262,114 @@ export class ConnectionManager {
}
private onJobCompleted = async (job: Job<JobPayload>) =>
groupmqLifecycleExceptionWrapper('onJobCompleted', logger, async () => {
const logger = createJobLogger(job.id);
const { connectionId, connectionName, orgId } = job.data;
private async onSyncJobCompleted(job: Job<JobPayload>, result: JobResult) {
this.logger.info(`Connection sync job for connection ${job.data.connectionName} (id: ${job.data.connectionId}, jobId: ${job.id}) completed`);
const { connectionId, orgId } = job.data;
await this.db.connectionSyncJob.update({
where: {
id: job.id,
},
data: {
status: ConnectionSyncJobStatus.COMPLETED,
completedAt: new Date(),
connection: {
update: {
syncedAt: new Date(),
}
}
}
});
// After a connection has synced, we need to re-sync the org's search contexts as
// there may be new repos that match the search context's include/exclude patterns.
if (env.CONFIG_PATH) {
try {
const config = await loadConfig(env.CONFIG_PATH);
await syncSearchContexts({
db: this.db,
orgId,
contexts: config.contexts,
});
} catch (err) {
logger.error(`Failed to sync search contexts for connection ${connectionId}: ${err}`);
Sentry.captureException(err);
}
let syncStatusMetadata: Record<string, unknown> = (await this.db.connection.findUnique({
where: { id: connectionId },
select: { syncStatusMetadata: true }
}))?.syncStatusMetadata as Record<string, unknown> ?? {};
const { notFound } = syncStatusMetadata as {
notFound: {
users: string[],
orgs: string[],
repos: string[],
}
};
logger.info(`Connection sync job ${job.id} for connection ${job.data.connectionName} (id: ${job.data.connectionId}) completed`);
this.promClient.activeConnectionSyncJobs.dec({ connection: connectionName });
this.promClient.connectionSyncJobSuccessTotal.inc({ connection: connectionName });
const result = job.returnvalue as JobResult;
captureEvent('backend_connection_sync_job_completed', {
connectionId: connectionId,
repoCount: result.repoCount,
});
});
private onJobFailed = async (job: Job<JobPayload>) =>
groupmqLifecycleExceptionWrapper('onJobFailed', logger, async () => {
const logger = createJobLogger(job.id);
const attempt = job.attemptsMade + 1;
const wasLastAttempt = attempt >= job.opts.attempts;
if (wasLastAttempt) {
const { connection } = await this.db.connectionSyncJob.update({
where: { id: job.id },
data: {
status: ConnectionSyncJobStatus.FAILED,
completedAt: new Date(),
errorMessage: job.failedReason,
},
select: {
connection: true,
}
});
this.promClient.activeConnectionSyncJobs.dec({ connection: connection.name });
this.promClient.connectionSyncJobFailTotal.inc({ connection: connection.name });
logger.error(`Failed job ${job.id} for connection ${connection.name} (id: ${connection.id}). Attempt ${attempt} / ${job.opts.attempts}. Failing job.`);
} else {
const connection = await this.db.connection.findUniqueOrThrow({
where: { id: job.data.connectionId },
});
this.promClient.connectionSyncJobReattemptsTotal.inc({ connection: connection.name });
logger.warn(`Failed job ${job.id} for connection ${connection.name} (id: ${connection.id}). Attempt ${attempt} / ${job.opts.attempts}. Retrying.`);
await this.db.connection.update({
where: {
id: connectionId,
},
data: {
syncStatus:
notFound.users.length > 0 ||
notFound.orgs.length > 0 ||
notFound.repos.length > 0 ? ConnectionSyncStatus.SYNCED_WITH_WARNINGS : ConnectionSyncStatus.SYNCED,
syncedAt: new Date()
}
captureEvent('backend_connection_sync_job_failed', {
connectionId: job.data.connectionId,
error: job.failedReason,
});
});
private onJobStalled = async (jobId: string) =>
groupmqLifecycleExceptionWrapper('onJobStalled', logger, async () => {
const logger = createJobLogger(jobId);
const { connection } = await this.db.connectionSyncJob.update({
where: { id: jobId },
data: {
status: ConnectionSyncJobStatus.FAILED,
completedAt: new Date(),
errorMessage: 'Job stalled',
},
select: {
connection: true,
}
});
// After a connection has synced, we need to re-sync the org's search contexts as
// there may be new repos that match the search context's include/exclude patterns.
if (env.CONFIG_PATH) {
try {
const config = await loadConfig(env.CONFIG_PATH);
this.promClient.activeConnectionSyncJobs.dec({ connection: connection.name });
this.promClient.connectionSyncJobFailTotal.inc({ connection: connection.name });
await syncSearchContexts({
db: this.db,
orgId,
contexts: config.contexts,
});
} catch (err) {
this.logger.error(`Failed to sync search contexts for connection ${connectionId}: ${err}`);
Sentry.captureException(err);
}
}
logger.error(`Job ${jobId} stalled for connection ${connection.name} (id: ${connection.id})`);
captureEvent('backend_connection_sync_job_failed', {
connectionId: connection.id,
error: 'Job stalled',
});
captureEvent('backend_connection_sync_job_completed', {
connectionId: connectionId,
repoCount: result.repoCount,
});
/**
 * Lifecycle handler for a job that timed out.
 *
 * Marks the job record as FAILED with a "Job timed out" message, updates the
 * active/failed Prometheus metrics for the owning connection, logs the
 * timeout and emits a `backend_connection_sync_job_failed` event.
 *
 * NOTE(review): presumably `groupmqLifecycleExceptionWrapper` catches and logs
 * anything thrown by the callback — confirm against its definition.
 */
private onJobGracefulTimeout = async (job: Job<JobPayload>) =>
    groupmqLifecycleExceptionWrapper('onJobGracefulTimeout', logger, async () => {
        const timeoutReason = 'Job timed out';
        // Job-scoped logger; the wrapper above receives the module-level one.
        const jobLogger = createJobLogger(job.id);

        const failedJob = await this.db.connectionSyncJob.update({
            where: { id: job.id },
            data: {
                status: ConnectionSyncJobStatus.FAILED,
                completedAt: new Date(),
                errorMessage: timeoutReason,
            },
            select: { connection: true },
        });
        const { id: connectionId, name: connectionName } = failedJob.connection;

        this.promClient.activeConnectionSyncJobs.dec({ connection: connectionName });
        this.promClient.connectionSyncJobFailTotal.inc({ connection: connectionName });

        jobLogger.error(`Job ${job.id} timed out for connection ${connectionName} (id: ${connectionId})`);

        captureEvent('backend_connection_sync_job_failed', {
            connectionId,
            error: timeoutReason,
        });
    });
/**
 * Reports a worker-level error to Sentry and writes it to the log.
 *
 * @param error The error raised by the worker.
 */
private async onWorkerError(error: Error) {
    const message = 'Connection syncer worker error.';
    Sentry.captureException(error);
    logger.error(message, error);
}
public async dispose() {
/**
 * Handles a failed connection sync job.
 *
 * Logs the failure, reports it to Sentry and, when the job is available,
 * emits a `backend_connection_sync_job_failed` event and marks the connection
 * as FAILED. Metadata already stored on the connection is kept and extended
 * with the error details rather than replaced.
 *
 * @param job The failed job, or `undefined` when no job is available.
 * @param err The error the job failed with.
 */
private async onSyncJobFailed(job: Job<JobPayload> | undefined, err: unknown) {
    // A failed sync is an error condition, so log it at error level rather than
    // info to keep it visible when info output is filtered.
    this.logger.error(`Connection sync job for connection ${job?.data.connectionName} (id: ${job?.data.connectionId}, jobId: ${job?.id}) failed with error: ${err}`);

    Sentry.captureException(err, {
        tags: {
            connectionid: job?.data.connectionId,
            jobId: job?.id,
            queue: QUEUE_NAME,
        }
    });

    // Without a job there is no connection to update.
    if (!job) {
        return;
    }

    const { connectionId } = job.data;

    captureEvent('backend_connection_sync_job_failed', {
        connectionId: connectionId,
        error: err instanceof BackendException ? err.code : 'UNKNOWN',
    });

    // We may have pushed some metadata during the execution of the job, so we
    // read it back and merge into it instead of overwriting it.
    const existingMetadata: Record<string, unknown> = (await this.db.connection.findUnique({
        where: { id: connectionId },
        select: { syncStatusMetadata: true }
    }))?.syncStatusMetadata as Record<string, unknown> ?? {};

    // For a BackendException the exception's own metadata is spread last, so it
    // takes precedence over both the stored metadata and the `error` code.
    const syncStatusMetadata: Record<string, unknown> = err instanceof BackendException
        ? { ...existingMetadata, error: err.code, ...err.metadata }
        : { ...existingMetadata, error: 'UNKNOWN' };

    await this.db.connection.update({
        where: {
            id: connectionId,
        },
        data: {
            syncStatus: ConnectionSyncStatus.FAILED,
            syncStatusMetadata: syncStatusMetadata as Prisma.InputJsonValue,
        }
    });
}
public dispose() {
if (this.interval) {
clearInterval(this.interval);
}
const inProgressJobs = this.worker.getCurrentJobs();
await this.worker.close(GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS);
// Manually release group locks for in progress jobs to prevent deadlocks.
// @see: https://github.com/Openpanel-dev/groupmq/issues/8
for (const { job } of inProgressJobs) {
const lockKey = `groupmq:${QUEUE_NAME}:lock:${job.groupId}`;
logger.debug(`Releasing group lock ${lockKey} for in progress job ${job.id}`);
try {
await this.redis.del(lockKey);
} catch (error) {
Sentry.captureException(error);
logger.error(`Failed to release group lock ${lockKey} for in progress job ${job.id}. Error: `, error);
}
}
// @note: As of groupmq v1.0.0, queue.close() will just close the underlying
// redis connection. Since we share the same redis client between, skip this
// step and close the redis client directly in index.ts.
// @see: https://github.com/Openpanel-dev/groupmq/blob/main/src/queue.ts#L1900
// await this.queue.close();
this.worker.close();
this.queue.close();
}
}

View file

@ -5,21 +5,21 @@ type ValidResult<T> = {
data: T[];
};
type WarningResult = {
type: 'warning';
warning: string;
type NotFoundResult = {
type: 'notFound';
value: string;
};
type CustomResult<T> = ValidResult<T> | WarningResult;
type CustomResult<T> = ValidResult<T> | NotFoundResult;
export function processPromiseResults<T>(
results: PromiseSettledResult<CustomResult<T>>[],
): {
validItems: T[];
warnings: string[];
notFoundItems: string[];
} {
const validItems: T[] = [];
const warnings: string[] = [];
const notFoundItems: string[] = [];
results.forEach(result => {
if (result.status === 'fulfilled') {
@ -27,14 +27,14 @@ export function processPromiseResults<T>(
if (value.type === 'valid') {
validItems.push(...value.data);
} else {
warnings.push(value.warning);
notFoundItems.push(value.value);
}
}
});
return {
validItems,
warnings,
notFoundItems,
};
}

View file

@ -1,33 +1,25 @@
import { CodeHostType } from "@sourcebot/db";
import { env } from "@sourcebot/shared";
import path from "path";
import { Settings } from "./types.js";
export const SINGLE_TENANT_ORG_ID = 1;
/**
* Default settings.
*
* Fields whose name ends in `Ms` are durations in milliseconds; the trailing
* comment on each line restates the value in human-readable units.
*/
export const DEFAULT_SETTINGS: Settings = {
maxFileSize: 2 * 1024 * 1024, // 2MB in bytes
maxTrigramCount: 20000,
reindexIntervalMs: 1000 * 60 * 60, // 1 hour
resyncConnectionIntervalMs: 1000 * 60 * 60 * 24, // 24 hours
resyncConnectionPollingIntervalMs: 1000 * 1, // 1 second
reindexRepoPollingIntervalMs: 1000 * 1, // 1 second
maxConnectionSyncJobConcurrency: 8,
maxRepoIndexingJobConcurrency: 8,
maxRepoGarbageCollectionJobConcurrency: 8,
repoGarbageCollectionGracePeriodMs: 10 * 1000, // 10 seconds
repoIndexTimeoutMs: 1000 * 60 * 60 * 2, // 2 hours
enablePublicAccess: false, // deprecated, use FORCE_ENABLE_ANONYMOUS_ACCESS instead
experiment_repoDrivenPermissionSyncIntervalMs: 1000 * 60 * 60 * 24, // 24 hours
experiment_userDrivenPermissionSyncIntervalMs: 1000 * 60 * 60 * 24, // 24 hours
}
export const PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES: CodeHostType[] = [
export const PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES = [
'github',
'gitlab',
];
export const REPOS_CACHE_DIR = path.join(env.DATA_CACHE_DIR, 'repos');
export const INDEX_CACHE_DIR = path.join(env.DATA_CACHE_DIR, 'index');
// Maximum time to wait for current job to finish
export const GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS = 5 * 1000; // 5 seconds
// List of shutdown signals
export const SHUTDOWN_SIGNALS: string[] = [
'SIGHUP',
'SIGINT',
'SIGQUIT',
'SIGILL',
'SIGTRAP',
'SIGABRT',
'SIGBUS',
'SIGFPE',
'SIGSEGV',
'SIGUSR2',
'SIGTERM',
// @note: SIGKILL and SIGSTOP cannot have listeners installed.
// @see: https://nodejs.org/api/process.html#signal-events
];
];

View file

@ -1,303 +0,0 @@
import * as Sentry from "@sentry/node";
import { PrismaClient, AccountPermissionSyncJobStatus, Account} from "@sourcebot/db";
import { env, hasEntitlement, createLogger } from "@sourcebot/shared";
import { Job, Queue, Worker } from "bullmq";
import { Redis } from "ioredis";
import { PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES } from "../constants.js";
import { createOctokitFromToken, getReposForAuthenticatedUser } from "../github.js";
import { createGitLabFromOAuthToken, getProjectsForAuthenticatedUser } from "../gitlab.js";
import { Settings } from "../types.js";
import { setIntervalAsync } from "../utils.js";
// Tag used for the module logger and as the prefix of every per-job logger.
const LOG_TAG = 'user-permission-syncer';
// Module-level logger, used outside the context of a specific job.
const logger = createLogger(LOG_TAG);
// Creates a logger whose tag includes the given job id.
const createJobLogger = (jobId: string) => createLogger(`${LOG_TAG}:job:${jobId}`);
// Name of the queue shared by the Queue and the Worker created in AccountPermissionSyncer.
const QUEUE_NAME = 'accountPermissionSyncQueue';
// Payload carried by each queued job.
type AccountPermissionSyncJob = {
// Id of the `accountPermissionSyncJob` database record this queue job belongs to.
jobId: string;
}
/**
 * Keeps the account -> repo permission mapping in sync with the code hosts.
 *
 * A polling scheduler finds accounts whose permissions are due for a sync and
 * creates an `accountPermissionSyncJob` record plus a queue job for each one.
 * The worker then asks the account's code host which repos the account can
 * access and replaces the account's stored permissions with that set.
 */
export class AccountPermissionSyncer {
    private queue: Queue<AccountPermissionSyncJob>;
    private worker: Worker<AccountPermissionSyncJob>;
    private interval?: NodeJS.Timeout;

    /**
     * @param db Prisma client used for all database access.
     * @param settings Provides `experiment_userDrivenPermissionSyncIntervalMs`.
     * @param redis Redis connection shared by the queue and the worker.
     */
    constructor(
        private db: PrismaClient,
        private settings: Settings,
        redis: Redis,
    ) {
        this.queue = new Queue<AccountPermissionSyncJob>(QUEUE_NAME, {
            connection: redis,
        });
        this.worker = new Worker<AccountPermissionSyncJob>(QUEUE_NAME, this.runJob.bind(this), {
            connection: redis,
            concurrency: 1,
        });
        this.worker.on('completed', this.onJobCompleted.bind(this));
        this.worker.on('failed', this.onJobFailed.bind(this));
    }

    /**
     * Starts the polling loop (every 5 seconds) that schedules permission syncs
     * for accounts that are due.
     *
     * @throws Error when the `permission-syncing` entitlement is missing.
     */
    public startScheduler() {
        if (!hasEntitlement('permission-syncing')) {
            throw new Error('Permission syncing is not supported in current plan.');
        }

        logger.debug('Starting scheduler');

        this.interval = setIntervalAsync(async () => {
            const thresholdDate = new Date(Date.now() - this.settings.experiment_userDrivenPermissionSyncIntervalMs);

            const accounts = await this.db.account.findMany({
                where: {
                    AND: [
                        // The account belongs to a code host that supports permission syncing.
                        {
                            provider: {
                                in: PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES
                            }
                        },
                        // The account has never been synced, or not since the threshold date.
                        {
                            OR: [
                                { permissionSyncedAt: null },
                                { permissionSyncedAt: { lt: thresholdDate } },
                            ]
                        },
                        {
                            NOT: {
                                permissionSyncJobs: {
                                    some: {
                                        OR: [
                                            // Don't schedule if there are active jobs
                                            {
                                                status: {
                                                    in: [
                                                        AccountPermissionSyncJobStatus.PENDING,
                                                        AccountPermissionSyncJobStatus.IN_PROGRESS,
                                                    ],
                                                }
                                            },
                                            // Don't schedule if there are recent failed jobs (within the threshold date). Note `gt` is used here since this is an inverse condition.
                                            {
                                                AND: [
                                                    { status: AccountPermissionSyncJobStatus.FAILED },
                                                    { completedAt: { gt: thresholdDate } },
                                                ]
                                            }
                                        ]
                                    }
                                }
                            }
                        },
                    ]
                }
            });

            await this.schedulePermissionSync(accounts);
        }, 1000 * 5);
    }

    /**
     * Stops the scheduler, force-closes the worker and closes the queue.
     */
    public async dispose() {
        if (this.interval) {
            clearInterval(this.interval);
        }
        await this.worker.close(/* force = */ true);
        await this.queue.close();
    }

    /**
     * Creates a job record for every given account and enqueues the jobs.
     *
     * @param accounts Accounts whose permissions should be synced.
     */
    private async schedulePermissionSync(accounts: Account[]) {
        // The scheduler polls every few seconds and usually finds nothing due;
        // skip the database write and the Redis round trip in that case.
        if (accounts.length === 0) {
            return;
        }

        // @note: we don't perform this in a transaction because
        // we want to avoid the situation where a job is created and run
        // prior to the transaction being committed.
        const jobs = await this.db.accountPermissionSyncJob.createManyAndReturn({
            data: accounts.map(account => ({
                accountId: account.id,
            })),
        });

        await this.queue.addBulk(jobs.map((job) => ({
            name: 'accountPermissionSyncJob',
            data: {
                jobId: job.id,
            },
            opts: {
                removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
                removeOnFail: env.REDIS_REMOVE_ON_FAIL,
            }
        })));
    }

    /**
     * Worker entry point: marks the job IN_PROGRESS, fetches the repos the
     * account can access on its code host, and replaces the account's stored
     * permissions with that set.
     *
     * @param job The queue job; `job.data.jobId` identifies the job record.
     * @throws Error when the account has no OAuth access token.
     */
    private async runJob(job: Job<AccountPermissionSyncJob>) {
        const id = job.data.jobId;
        const logger = createJobLogger(id);

        const { account } = await this.db.accountPermissionSyncJob.update({
            where: {
                id,
            },
            data: {
                status: AccountPermissionSyncJobStatus.IN_PROGRESS,
            },
            select: {
                account: {
                    include: {
                        user: true,
                    }
                }
            }
        });

        logger.info(`Syncing permissions for ${account.provider} account (id: ${account.id}) for user ${account.user.email}...`);

        // Get the ids of all indexed repos that this account has access to on its code host.
        const repoIds = await (async () => {
            const aggregatedRepoIds: Set<number> = new Set();

            if (account.provider === 'github') {
                if (!account.access_token) {
                    throw new Error(`User '${account.user.email}' does not have an GitHub OAuth access token associated with their GitHub account.`);
                }

                const { octokit } = await createOctokitFromToken({
                    token: account.access_token,
                    url: env.AUTH_EE_GITHUB_BASE_URL,
                });

                // @note: we only care about the private repos since we don't need to build a mapping
                // for public repos.
                // @see: packages/web/src/prisma.ts
                const githubRepos = await getReposForAuthenticatedUser(/* visibility = */ 'private', octokit);
                const gitHubRepoIds = githubRepos.map(repo => repo.id.toString());

                const repos = await this.db.repo.findMany({
                    where: {
                        external_codeHostType: 'github',
                        external_id: {
                            in: gitHubRepoIds,
                        }
                    }
                });

                repos.forEach(repo => aggregatedRepoIds.add(repo.id));
            } else if (account.provider === 'gitlab') {
                if (!account.access_token) {
                    throw new Error(`User '${account.user.email}' does not have a GitLab OAuth access token associated with their GitLab account.`);
                }

                const api = await createGitLabFromOAuthToken({
                    oauthToken: account.access_token,
                    url: env.AUTH_EE_GITLAB_BASE_URL,
                });

                // @note: we only care about the private and internal repos since we don't need to build a mapping
                // for public repos.
                // @see: packages/web/src/prisma.ts
                const privateGitLabProjects = await getProjectsForAuthenticatedUser('private', api);
                const internalGitLabProjects = await getProjectsForAuthenticatedUser('internal', api);

                const gitLabProjectIds = [
                    ...privateGitLabProjects,
                    ...internalGitLabProjects,
                ].map(project => project.id.toString());

                const repos = await this.db.repo.findMany({
                    where: {
                        external_codeHostType: 'gitlab',
                        external_id: {
                            in: gitLabProjectIds,
                        }
                    }
                });

                repos.forEach(repo => aggregatedRepoIds.add(repo.id));
            }

            return Array.from(aggregatedRepoIds);
        })();

        // Replace the account's permissions in one transaction: drop every
        // existing row, then insert the freshly computed set.
        await this.db.$transaction([
            this.db.account.update({
                where: {
                    id: account.id,
                },
                data: {
                    accessibleRepos: {
                        deleteMany: {},
                    }
                }
            }),
            this.db.accountToRepoPermission.createMany({
                data: repoIds.map(repoId => ({
                    accountId: account.id,
                    repoId,
                })),
                skipDuplicates: true,
            })
        ]);
    }

    /**
     * `completed` listener: marks the job record COMPLETED and stamps the
     * account's `permissionSyncedAt`.
     *
     * This runs as an event listener, so nothing awaits the returned promise.
     * Failures are therefore caught and reported here instead of surfacing as
     * an unhandled promise rejection.
     *
     * @param job The completed queue job.
     */
    private async onJobCompleted(job: Job<AccountPermissionSyncJob>) {
        const logger = createJobLogger(job.data.jobId);

        try {
            const { account } = await this.db.accountPermissionSyncJob.update({
                where: {
                    id: job.data.jobId,
                },
                data: {
                    status: AccountPermissionSyncJobStatus.COMPLETED,
                    account: {
                        update: {
                            permissionSyncedAt: new Date(),
                        },
                    },
                    completedAt: new Date(),
                },
                select: {
                    account: {
                        include: {
                            user: true,
                        }
                    }
                }
            });

            logger.info(`Permissions synced for ${account.provider} account (id: ${account.id}) for user ${account.user.email}`);
        } catch (error) {
            Sentry.captureException(error, {
                tags: {
                    jobId: job.data.jobId,
                    queue: QUEUE_NAME,
                }
            });
            logger.error(`Failed to record completion of account permission sync job ${job.data.jobId}: ${error}`);
        }
    }

    /**
     * `failed` listener: reports the error to Sentry and, when the job is
     * available, marks the job record FAILED with the error message.
     *
     * As with `onJobCompleted`, errors raised while recording the failure are
     * caught and reported rather than left as an unhandled promise rejection.
     *
     * @param job The failed queue job, or `undefined` when no job is available.
     * @param err The error the job failed with.
     */
    private async onJobFailed(job: Job<AccountPermissionSyncJob> | undefined, err: Error) {
        const logger = createJobLogger(job?.data.jobId ?? 'unknown');

        Sentry.captureException(err, {
            tags: {
                jobId: job?.data.jobId,
                queue: QUEUE_NAME,
            }
        });

        const errorMessage = (accountId: string, email: string) => `Account permission sync job failed for account (id: ${accountId}) for user ${email}: ${err.message}`;

        if (!job) {
            logger.error(errorMessage('unknown account (id not found)', 'unknown user (id not found)'));
            return;
        }

        try {
            const { account } = await this.db.accountPermissionSyncJob.update({
                where: {
                    id: job.data.jobId,
                },
                data: {
                    status: AccountPermissionSyncJobStatus.FAILED,
                    completedAt: new Date(),
                    errorMessage: err.message,
                },
                select: {
                    account: {
                        include: {
                            user: true,
                        }
                    }
                }
            });

            logger.error(errorMessage(account.id, account.user.email ?? 'unknown user (email not found)'));
        } catch (error) {
            Sentry.captureException(error, {
                tags: {
                    jobId: job.data.jobId,
                    queue: QUEUE_NAME,
                }
            });
            logger.error(`Failed to record failure of account permission sync job ${job.data.jobId}: ${error}`);
        }
    }
}

View file

@ -1,115 +0,0 @@
import { App } from "@octokit/app";
import { getTokenFromConfig } from "@sourcebot/shared";
import { PrismaClient } from "@sourcebot/db";
import { createLogger } from "@sourcebot/shared";
import { GitHubAppConfig } from "@sourcebot/schemas/v3/index.type";
import { env, loadConfig } from "@sourcebot/shared";
// Module-level logger for the GitHub App manager.
const logger = createLogger('githubAppManager');
// Hostname used when an app config or a caller does not specify a deployment hostname.
const GITHUB_DEFAULT_DEPLOYMENT_HOSTNAME = 'github.com';
// A single installation of a configured GitHub App, as cached by GithubAppManager.
type Installation = {
// Installation id reported by the GitHub API.
id: number;
// Numeric id of the GitHub App this installation belongs to.
appId: number;
// The account the app is installed on.
account: {
// Login of the account.
login: string;
// Account type, lower-cased from the value reported by the GitHub API.
type: 'organization' | 'user';
};
};
/**
 * Singleton that loads the GitHub Apps declared in the config file, caches
 * their installations keyed by `<deploymentHostname>/<owner>`, and issues
 * installation access tokens on request.
 */
export class GithubAppManager {
    private static instance: GithubAppManager | null = null;
    // Configured apps, keyed by numeric app id.
    private octokitApps: Map<number, App>;
    // Known installations, keyed by `generateMapKey(owner, deploymentHostname)`.
    private installationMap: Map<string, Installation>;
    private db: PrismaClient | null = null;
    private initialized: boolean = false;

    private constructor() {
        this.octokitApps = new Map<number, App>();
        this.installationMap = new Map<string, Installation>();
    }

    /**
     * Returns the shared instance, creating it on first use.
     */
    public static getInstance(): GithubAppManager {
        if (!GithubAppManager.instance) {
            GithubAppManager.instance = new GithubAppManager();
        }
        return GithubAppManager.instance;
    }

    /**
     * @throws Error when `init()` has not completed yet.
     */
    private ensureInitialized(): void {
        if (!this.initialized) {
            throw new Error('GithubAppManager must be initialized before use. Call init() first.');
        }
    }

    /**
     * Loads the GitHub Apps from the config file and caches every installation
     * of each app.
     *
     * @param db Prisma client to associate with the manager.
     */
    public async init(db: PrismaClient) {
        this.db = db;
        const config = await loadConfig(env.CONFIG_PATH);

        if (!config.apps) {
            // Having no apps configured is a valid, fully initialized state.
            // Without this, later calls would claim init() was never called.
            this.initialized = true;
            return;
        }

        const githubApps = config.apps.filter(app => app.type === 'github') as GitHubAppConfig[];
        logger.info(`Found ${githubApps.length} GitHub apps in config`);

        for (const app of githubApps) {
            const deploymentHostname = app.deploymentHostname as string || GITHUB_DEFAULT_DEPLOYMENT_HOSTNAME;
            const privateKey = await getTokenFromConfig(app.privateKey);

            // NOTE(review): `deploymentHostname` is only used for the map key; the
            // App itself is created without a custom base URL — confirm this is
            // intended for non-github.com deployments.
            const octokitApp = new App({
                appId: Number(app.id),
                privateKey: privateKey,
            });
            this.octokitApps.set(Number(app.id), octokitApp);

            const installations = await this.listInstallations(octokitApp);
            logger.info(`Found ${installations.length} GitHub App installations for ${deploymentHostname}/${app.id}:`);

            for (const installationData of installations) {
                if (!installationData.account || !installationData.account.login || !installationData.account.type) {
                    logger.warn(`Skipping installation ${installationData.id}: missing account data (${installationData.account})`);
                    continue;
                }

                logger.info(`\tInstallation ID: ${installationData.id}, Account: ${installationData.account.login}, Type: ${installationData.account.type}`);

                const owner = installationData.account.login;
                const accountType = installationData.account.type.toLowerCase() as 'organization' | 'user';
                const installation: Installation = {
                    id: installationData.id,
                    appId: Number(app.id),
                    account: {
                        login: owner,
                        type: accountType,
                    },
                };
                this.installationMap.set(this.generateMapKey(owner, deploymentHostname), installation);
            }
        }

        this.initialized = true;
    }

    /**
     * Returns an installation access token for the app installed on `owner`.
     *
     * @param owner Login of the account the app is installed on.
     * @param deploymentHostname Hostname of the GitHub deployment; defaults to github.com.
     * @returns The installation access token.
     * @throws Error when the manager is not initialized, or when no installation
     *         (or its app) is known for the given owner and hostname.
     */
    public async getInstallationToken(owner: string, deploymentHostname: string = GITHUB_DEFAULT_DEPLOYMENT_HOSTNAME): Promise<string> {
        this.ensureInitialized();

        const key = this.generateMapKey(owner, deploymentHostname);
        const installation = this.installationMap.get(key);
        if (!installation) {
            throw new Error(`GitHub App Installation not found for ${key}`);
        }

        // Check the lookup explicitly instead of casting, so a missing app gives
        // a clear error rather than a failure on `undefined`.
        const octokitApp = this.octokitApps.get(installation.appId);
        if (!octokitApp) {
            throw new Error(`GitHub App ${installation.appId} not found for installation ${key}`);
        }

        const installationOctokit = await octokitApp.getInstallationOctokit(installation.id);
        const auth = await installationOctokit.auth({ type: "installation" }) as { expires_at: string, token: string };
        return auth.token;
    }

    /**
     * @returns `true` when at least one GitHub App has been loaded.
     */
    public appsConfigured() {
        return this.octokitApps.size > 0;
    }

    /** Builds the installation map key for an owner on a deployment. */
    private generateMapKey(owner: string, deploymentHostname: string): string {
        return `${deploymentHostname}/${owner}`;
    }

    /**
     * Fetches every installation of the given app, following pagination so that
     * installations beyond the first page are not dropped.
     */
    private async listInstallations(octokitApp: App) {
        const perPage = 100;

        const firstPage = await octokitApp.octokit.request("GET /app/installations", {
            per_page: perPage,
            page: 1,
        });
        const installations = [...firstPage.data];

        // A full page means there may be more; a short page is the last one.
        let lastPageSize = firstPage.data.length;
        for (let page = 2; lastPageSize === perPage; page++) {
            const { data } = await octokitApp.octokit.request("GET /app/installations", {
                per_page: perPage,
                page,
            });
            installations.push(...data);
            lastPageSize = data.length;
        }

        return installations;
    }
}

View file

@ -1,14 +1,14 @@
import * as Sentry from "@sentry/node";
import { PrismaClient, Repo, RepoPermissionSyncJobStatus } from "@sourcebot/db";
import { createLogger } from "@sourcebot/shared";
import { env, hasEntitlement } from "@sourcebot/shared";
import { createLogger } from "@sourcebot/logger";
import { hasEntitlement } from "@sourcebot/shared";
import { Job, Queue, Worker } from 'bullmq';
import { Redis } from 'ioredis';
import { PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES } from "../constants.js";
import { createOctokitFromToken, getRepoCollaborators, GITHUB_CLOUD_HOSTNAME } from "../github.js";
import { createGitLabFromPersonalAccessToken, getProjectMembers } from "../gitlab.js";
import { env } from "../env.js";
import { createOctokitFromToken, getRepoCollaborators } from "../github.js";
import { Settings } from "../types.js";
import { getAuthCredentialsForRepo, setIntervalAsync } from "../utils.js";
import { getAuthCredentialsForRepo } from "../utils.js";
type RepoPermissionSyncJob = {
jobId: string;
@ -16,9 +16,8 @@ type RepoPermissionSyncJob = {
const QUEUE_NAME = 'repoPermissionSyncQueue';
const LOG_TAG = 'repo-permission-syncer';
const logger = createLogger(LOG_TAG);
const createJobLogger = (jobId: string) => createLogger(`${LOG_TAG}:job:${jobId}`);
const logger = createLogger('repo-permission-syncer');
export class RepoPermissionSyncer {
private queue: Queue<RepoPermissionSyncJob>;
@ -48,34 +47,26 @@ export class RepoPermissionSyncer {
logger.debug('Starting scheduler');
this.interval = setIntervalAsync(async () => {
this.interval = setInterval(async () => {
// @todo: make this configurable
const thresholdDate = new Date(Date.now() - this.settings.experiment_repoDrivenPermissionSyncIntervalMs);
const repos = await this.db.repo.findMany({
// Repos need their permissions to be synced against the code host when...
where: {
// They belong to a code host that supports permissions syncing
AND: [
// They are not public. Public repositories are always visible to all users, therefore we don't
// need to explicitly perform permission syncing for them.
// @see: packages/web/src/prisma.ts
{
isPublic: false
},
// They belong to a code host that supports permissions syncing
{
external_codeHostType: {
in: PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES,
}
},
// They have not been synced within the threshold date.
{
OR: [
{ permissionSyncedAt: null },
{ permissionSyncedAt: { lt: thresholdDate } },
],
},
// There aren't any active or recently failed jobs.
{
NOT: {
permissionSyncJobs: {
@ -110,40 +101,37 @@ export class RepoPermissionSyncer {
}, 1000 * 5);
}
public async dispose() {
public dispose() {
if (this.interval) {
clearInterval(this.interval);
}
await this.worker.close(/* force = */ true);
await this.queue.close();
this.worker.close();
this.queue.close();
}
private async schedulePermissionSync(repos: Repo[]) {
// @note: we don't perform this in a transaction because
// we want to avoid the situation where a job is created and run
// prior to the transaction being committed.
const jobs = await this.db.repoPermissionSyncJob.createManyAndReturn({
data: repos.map(repo => ({
repoId: repo.id,
})),
});
await this.db.$transaction(async (tx) => {
const jobs = await tx.repoPermissionSyncJob.createManyAndReturn({
data: repos.map(repo => ({
repoId: repo.id,
})),
});
await this.queue.addBulk(jobs.map((job) => ({
name: 'repoPermissionSyncJob',
data: {
jobId: job.id,
},
opts: {
removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
removeOnFail: env.REDIS_REMOVE_ON_FAIL,
}
})))
await this.queue.addBulk(jobs.map((job) => ({
name: 'repoPermissionSyncJob',
data: {
jobId: job.id,
},
opts: {
removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
removeOnFail: env.REDIS_REMOVE_ON_FAIL,
}
})))
});
}
private async runJob(job: Job<RepoPermissionSyncJob>) {
const id = job.data.jobId;
const logger = createJobLogger(id);
const { repo } = await this.db.repoPermissionSyncJob.update({
where: {
id,
@ -170,17 +158,16 @@ export class RepoPermissionSyncer {
logger.info(`Syncing permissions for repo ${repo.displayName}...`);
const credentials = await getAuthCredentialsForRepo(repo, logger);
const credentials = await getAuthCredentialsForRepo(repo, this.db, logger);
if (!credentials) {
throw new Error(`No credentials found for repo ${id}`);
}
const accountIds = await (async () => {
const userIds = await (async () => {
if (repo.external_codeHostType === 'github') {
const isGitHubCloud = credentials.hostUrl ? new URL(credentials.hostUrl).hostname === GITHUB_CLOUD_HOSTNAME : false;
const { octokit } = await createOctokitFromToken({
token: credentials.token,
url: isGitHubCloud ? undefined : credentials.hostUrl,
url: credentials.hostUrl,
});
// @note: this is a bit of a hack since the displayName _might_ not be set..
@ -202,33 +189,12 @@ export class RepoPermissionSyncer {
in: githubUserIds,
}
},
});
return accounts.map(account => account.id);
} else if (repo.external_codeHostType === 'gitlab') {
const api = await createGitLabFromPersonalAccessToken({
token: credentials.token,
url: credentials.hostUrl,
});
const projectId = repo.external_id;
if (!projectId) {
throw new Error(`Repo ${id} does not have an external_id`);
}
const members = await getProjectMembers(projectId, api);
const gitlabUserIds = members.map(member => member.id.toString());
const accounts = await this.db.account.findMany({
where: {
provider: 'gitlab',
providerAccountId: {
in: gitlabUserIds,
}
select: {
userId: true,
},
});
return accounts.map(account => account.id);
return accounts.map(account => account.userId);
}
return [];
@ -240,14 +206,14 @@ export class RepoPermissionSyncer {
id: repo.id,
},
data: {
permittedAccounts: {
permittedUsers: {
deleteMany: {},
}
}
}),
this.db.accountToRepoPermission.createMany({
data: accountIds.map(accountId => ({
accountId,
this.db.userToRepoPermission.createMany({
data: userIds.map(userId => ({
userId,
repoId: repo.id,
})),
})
@ -255,8 +221,6 @@ export class RepoPermissionSyncer {
}
private async onJobCompleted(job: Job<RepoPermissionSyncJob>) {
const logger = createJobLogger(job.data.jobId);
const { repo } = await this.db.repoPermissionSyncJob.update({
where: {
id: job.data.jobId,
@ -279,8 +243,6 @@ export class RepoPermissionSyncer {
}
private async onJobFailed(job: Job<RepoPermissionSyncJob> | undefined, err: Error) {
const logger = createJobLogger(job?.data.jobId ?? 'unknown');
Sentry.captureException(err, {
tags: {
jobId: job?.data.jobId,

View file

@ -0,0 +1,266 @@
import * as Sentry from "@sentry/node";
import { PrismaClient, User, UserPermissionSyncJobStatus } from "@sourcebot/db";
import { createLogger } from "@sourcebot/logger";
import { Job, Queue, Worker } from "bullmq";
import { Redis } from "ioredis";
import { PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES } from "../constants.js";
import { env } from "../env.js";
import { createOctokitFromToken, getReposForAuthenticatedUser } from "../github.js";
import { hasEntitlement } from "@sourcebot/shared";
import { Settings } from "../types.js";
// Module-level logger, created with the 'user-permission-syncer' label.
const logger = createLogger('user-permission-syncer');
// Name shared by the BullMQ queue and by the worker that consumes it.
const QUEUE_NAME = 'userPermissionSyncQueue';
// Payload carried by every entry placed on the queue.
type UserPermissionSyncJob = {
// Id of the `userPermissionSyncJob` database row this queue entry refers to.
jobId: string;
}
/**
 * Schedules and runs per-user permission sync jobs.
 *
 * A scheduler tick selects users that have at least one account on a code
 * host that supports permission syncing and whose permissions are stale,
 * creates a `userPermissionSyncJob` row for each of them and places a matching
 * entry on a BullMQ queue. A single-concurrency worker then resolves which
 * repositories the user can access and rewrites the user's
 * `userToRepoPermission` rows.
 */
export class UserPermissionSyncer {
    private queue: Queue<UserPermissionSyncJob>;
    private worker: Worker<UserPermissionSyncJob>;
    private interval?: NodeJS.Timeout;

    /**
     * @param db Prisma client used for all database access.
     * @param settings Provides `experiment_userDrivenPermissionSyncIntervalMs`,
     *   the staleness threshold used by the scheduler.
     * @param redis Connection shared by the queue and the worker.
     */
    constructor(
        private db: PrismaClient,
        private settings: Settings,
        redis: Redis,
    ) {
        this.queue = new Queue<UserPermissionSyncJob>(QUEUE_NAME, {
            connection: redis,
        });
        this.worker = new Worker<UserPermissionSyncJob>(QUEUE_NAME, this.runJob.bind(this), {
            connection: redis,
            concurrency: 1,
        });
        this.worker.on('completed', this.onJobCompleted.bind(this));
        this.worker.on('failed', this.onJobFailed.bind(this));
    }

    /**
     * Starts the polling loop that schedules sync jobs every 5 seconds.
     *
     * @throws Error when the 'permission-syncing' entitlement is missing.
     */
    public startScheduler() {
        if (!hasEntitlement('permission-syncing')) {
            throw new Error('Permission syncing is not supported in current plan.');
        }

        logger.debug('Starting scheduler');

        this.interval = setInterval(async () => {
            // The callback is async, so anything thrown here would surface as an
            // unhandled promise rejection. Report it and let the next tick retry.
            try {
                const thresholdDate = new Date(Date.now() - this.settings.experiment_userDrivenPermissionSyncIntervalMs);

                const users = await this.db.user.findMany({
                    where: {
                        AND: [
                            // They have an account on a code host that supports permission syncing.
                            {
                                accounts: {
                                    some: {
                                        provider: {
                                            in: PERMISSION_SYNC_SUPPORTED_CODE_HOST_TYPES
                                        }
                                    }
                                }
                            },
                            // They have not been synced within the threshold date.
                            {
                                OR: [
                                    { permissionSyncedAt: null },
                                    { permissionSyncedAt: { lt: thresholdDate } },
                                ]
                            },
                            // There aren't any active or recently failed jobs.
                            {
                                NOT: {
                                    permissionSyncJobs: {
                                        some: {
                                            OR: [
                                                // Don't schedule if there are active jobs
                                                {
                                                    status: {
                                                        in: [
                                                            UserPermissionSyncJobStatus.PENDING,
                                                            UserPermissionSyncJobStatus.IN_PROGRESS,
                                                        ],
                                                    }
                                                },
                                                // Don't schedule if there are recent failed jobs (within the threshold date).
                                                // Note `gt` is used here since this is an inverse condition.
                                                {
                                                    AND: [
                                                        { status: UserPermissionSyncJobStatus.FAILED },
                                                        { completedAt: { gt: thresholdDate } },
                                                    ]
                                                }
                                            ]
                                        }
                                    }
                                }
                            },
                        ]
                    }
                });

                await this.schedulePermissionSync(users);
            } catch (err) {
                Sentry.captureException(err);
                const reason = err instanceof Error ? err.message : String(err);
                logger.error(`Failed to schedule user permission sync jobs: ${reason}`);
            }
        }, 1000 * 5);
    }

    /**
     * Stops the scheduler and closes the worker and the queue.
     *
     * The returned promise settles once both have been closed. Callers that do
     * not await it keep working as before, but close errors now reach an
     * awaiting caller instead of being dropped.
     */
    public async dispose() {
        if (this.interval) {
            clearInterval(this.interval);
            this.interval = undefined;
        }
        await this.worker.close();
        await this.queue.close();
    }

    /**
     * Creates one `userPermissionSyncJob` row per user and enqueues a matching
     * queue entry for each row.
     */
    private async schedulePermissionSync(users: User[]) {
        // Nothing to do; avoid a pointless round trip to the database and Redis.
        if (users.length === 0) {
            return;
        }

        // @note: the rows are deliberately NOT created inside a transaction that
        // also wraps the enqueue. Otherwise the worker could pick a job up before
        // the transaction commits, and `runJob` would not find its row.
        const jobs = await this.db.userPermissionSyncJob.createManyAndReturn({
            data: users.map(user => ({
                userId: user.id,
            })),
        });

        try {
            await this.queue.addBulk(jobs.map((job) => ({
                name: 'userPermissionSyncJob',
                data: {
                    jobId: job.id,
                },
                opts: {
                    removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
                    removeOnFail: env.REDIS_REMOVE_ON_FAIL,
                }
            })));
        } catch (err) {
            // Without the transaction there is no rollback, so compensate by hand:
            // rows left in PENDING would stop the scheduler from ever selecting
            // these users again.
            const reason = err instanceof Error ? err.message : String(err);
            await this.db.userPermissionSyncJob.updateMany({
                where: {
                    id: {
                        in: jobs.map(job => job.id),
                    },
                },
                data: {
                    status: UserPermissionSyncJobStatus.FAILED,
                    completedAt: new Date(),
                    errorMessage: `Failed to enqueue job: ${reason}`,
                },
            }).catch((cleanupErr) => {
                const cleanupReason = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
                logger.error(`Failed to mark un-enqueued user permission sync jobs as failed: ${cleanupReason}`);
            });
            throw err;
        }
    }

    /**
     * Worker entry point: marks the job as in progress, collects the ids of the
     * repos the user can access through their connected accounts, and replaces
     * the user's permission rows with that set.
     *
     * @throws Error when the job has no user, or when a GitHub account has no
     *   access token.
     */
    private async runJob(job: Job<UserPermissionSyncJob>) {
        const id = job.data.jobId;
        const { user } = await this.db.userPermissionSyncJob.update({
            where: {
                id,
            },
            data: {
                status: UserPermissionSyncJobStatus.IN_PROGRESS,
            },
            select: {
                user: {
                    include: {
                        accounts: true,
                    }
                }
            }
        });

        if (!user) {
            // `id` is the job id, not a user id.
            throw new Error(`No user found for permission sync job ${id}`);
        }

        logger.info(`Syncing permissions for user ${user.email}...`);

        // Get a list of all repos that the user has access to from all connected accounts.
        const repoIds = await (async () => {
            const aggregatedRepoIds: Set<number> = new Set();

            for (const account of user.accounts) {
                // Only GitHub accounts are handled here; other providers are skipped.
                if (account.provider === 'github') {
                    if (!account.access_token) {
                        throw new Error(`User '${user.email}' does not have an GitHub OAuth access token associated with their GitHub account.`);
                    }

                    const { octokit } = await createOctokitFromToken({
                        token: account.access_token,
                        url: env.AUTH_EE_GITHUB_BASE_URL,
                    });

                    // @note: we only care about the private repos since we don't need to build a mapping
                    // for public repos.
                    // @see: packages/web/src/prisma.ts
                    const githubRepos = await getReposForAuthenticatedUser(/* visibility = */ 'private', octokit);
                    const gitHubRepoIds = githubRepos.map(repo => repo.id.toString());

                    // Only the id is used below, so don't fetch full rows.
                    const repos = await this.db.repo.findMany({
                        where: {
                            external_codeHostType: 'github',
                            external_id: {
                                in: gitHubRepoIds,
                            }
                        },
                        select: {
                            id: true,
                        },
                    });

                    repos.forEach(repo => aggregatedRepoIds.add(repo.id));
                }
            }

            return Array.from(aggregatedRepoIds);
        })();

        // Replace the user's permission rows in a single transaction so readers
        // never observe the cleared-but-not-yet-repopulated state.
        await this.db.$transaction([
            this.db.user.update({
                where: {
                    id: user.id,
                },
                data: {
                    accessibleRepos: {
                        deleteMany: {},
                    }
                }
            }),
            this.db.userToRepoPermission.createMany({
                data: repoIds.map(repoId => ({
                    userId: user.id,
                    repoId,
                })),
                skipDuplicates: true,
            })
        ]);
    }

    /**
     * 'completed' handler: marks the job as COMPLETED and stamps both the job
     * and the user with the same completion time.
     */
    private async onJobCompleted(job: Job<UserPermissionSyncJob>) {
        const completedAt = new Date();
        const { user } = await this.db.userPermissionSyncJob.update({
            where: {
                id: job.data.jobId,
            },
            data: {
                status: UserPermissionSyncJobStatus.COMPLETED,
                user: {
                    update: {
                        permissionSyncedAt: completedAt,
                    }
                },
                completedAt,
            },
            select: {
                user: true
            }
        });

        logger.info(`Permissions synced for user ${user.email}`);
    }

    /**
     * 'failed' handler: reports the error to Sentry and, when the job is known,
     * records the failure on its database row.
     *
     * @param job The failed job; the handler also covers the case where it is undefined.
     * @param err The error that caused the failure.
     */
    private async onJobFailed(job: Job<UserPermissionSyncJob> | undefined, err: Error) {
        Sentry.captureException(err, {
            tags: {
                jobId: job?.data.jobId,
                queue: QUEUE_NAME,
            }
        });

        const errorMessage = (email: string) => `User permission sync job failed for user ${email}: ${err.message}`;

        if (job) {
            const { user } = await this.db.userPermissionSyncJob.update({
                where: {
                    id: job.data.jobId,
                },
                data: {
                    status: UserPermissionSyncJobStatus.FAILED,
                    completedAt: new Date(),
                    errorMessage: err.message,
                },
                select: {
                    user: true,
                }
            });

            logger.error(errorMessage(user.email ?? user.id));
        } else {
            logger.error(errorMessage('unknown user (id not found)'));
        }
    }
}

View file

@ -0,0 +1,62 @@
import { createEnv } from "@t3-oss/env-core";
import { z } from "zod";
import dotenv from 'dotenv';
// Boolean settings are written as the literal strings 'true' / 'false' and are
// validated as exactly those two strings.
const booleanSchema = z.enum(["true", "false"]);

// Values in .env files are strings, so numeric settings go through zod's
// coercion to become numbers.
// @see: https://zod.dev/?id=coercion-for-primitives
const numberSchema = z.coerce.number();

// `.env` is loaded first; `.env.local` is loaded second with `override: true`.
dotenv.config({ path: './.env' });
dotenv.config({ path: './.env.local', override: true });
// Validated view of `process.env`. Empty strings are treated as undefined,
// and validation is skipped when SKIP_ENV_VALIDATION is "1".
export const env = createEnv({
    server: {
        // Core
        SOURCEBOT_ENCRYPTION_KEY: z.string(),
        SOURCEBOT_INSTALL_ID: z.string().default("unknown"),
        NEXT_PUBLIC_SOURCEBOT_VERSION: z.string().default("unknown"),
        DATA_CACHE_DIR: z.string(),
        CONFIG_PATH: z.string().optional(),
        DATABASE_URL: z
            .string()
            .url()
            .default("postgresql://postgres:postgres@localhost:5432/postgres"),

        // Telemetry
        SOURCEBOT_TELEMETRY_DISABLED: booleanSchema.default("false"),
        NEXT_PUBLIC_POSTHOG_PAPIK: z.string().optional(),

        // Fallback code host tokens
        FALLBACK_GITHUB_CLOUD_TOKEN: z.string().optional(),
        FALLBACK_GITLAB_CLOUD_TOKEN: z.string().optional(),
        FALLBACK_GITEA_CLOUD_TOKEN: z.string().optional(),

        // Redis
        REDIS_URL: z
            .string()
            .url()
            .default("redis://localhost:6379"),
        REDIS_REMOVE_ON_COMPLETE: numberSchema.default(0),
        REDIS_REMOVE_ON_FAIL: numberSchema.default(100),

        // Error reporting and logging
        NEXT_PUBLIC_SENTRY_BACKEND_DSN: z.string().optional(),
        NEXT_PUBLIC_SENTRY_ENVIRONMENT: z.string().optional(),
        LOGTAIL_TOKEN: z.string().optional(),
        LOGTAIL_HOST: z.string().url().optional(),
        SOURCEBOT_LOG_LEVEL: z
            .enum(["info", "debug", "warn", "error"])
            .default("info"),

        // Sync tuning
        CONNECTION_MANAGER_UPSERT_TIMEOUT_MS: numberSchema.default(300000),
        REPO_SYNC_RETRY_BASE_SLEEP_SECONDS: numberSchema.default(60),
        GITLAB_CLIENT_QUERY_TIMEOUT_SECONDS: numberSchema.default(60 * 10),

        // Enterprise / experimental
        EXPERIMENT_EE_PERMISSION_SYNC_ENABLED: booleanSchema.default("false"),
        AUTH_EE_GITHUB_BASE_URL: z.string().optional(),
    },
    runtimeEnv: process.env,
    emptyStringAsUndefined: true,
    skipValidation: process.env.SKIP_ENV_VALIDATION === "1",
});

View file

@ -1,8 +1,11 @@
import { GerritConnectionConfig } from "@sourcebot/schemas/v3/index.type";
import { createLogger } from '@sourcebot/shared';
import fetch from 'cross-fetch';
import { GerritConnectionConfig } from "@sourcebot/schemas/v3/index.type"
import { createLogger } from '@sourcebot/logger';
import micromatch from "micromatch";
import { fetchWithRetry, measure } from './utils.js';
import { measure, fetchWithRetry } from './utils.js';
import { BackendError } from '@sourcebot/error';
import { BackendException } from '@sourcebot/error';
import * as Sentry from "@sentry/node";
// https://gerrit-review.googlesource.com/Documentation/rest-api.html
interface GerritProjects {
@ -34,12 +37,29 @@ const logger = createLogger('gerrit');
export const getGerritReposFromConfig = async (config: GerritConnectionConfig): Promise<GerritProject[]> => {
const url = config.url.endsWith('/') ? config.url : `${config.url}/`;
const hostname = new URL(config.url).hostname;
let { durationMs, data: projects } = await measure(async () => {
const fetchFn = () => fetchAllProjects(url);
return fetchWithRetry(fetchFn, `projects from ${url}`, logger);
try {
const fetchFn = () => fetchAllProjects(url);
return fetchWithRetry(fetchFn, `projects from ${url}`, logger);
} catch (err) {
Sentry.captureException(err);
if (err instanceof BackendException) {
throw err;
}
logger.error(`Failed to fetch projects from ${url}`, err);
return null;
}
});
if (!projects) {
const e = new Error(`Failed to fetch projects from ${url}`);
Sentry.captureException(e);
throw e;
}
// include repos by glob if specified in config
if (config.projects) {
projects = projects.filter((project) => {
@ -72,9 +92,27 @@ const fetchAllProjects = async (url: string): Promise<GerritProject[]> => {
logger.debug(`Fetching projects from Gerrit at ${endpointWithParams}`);
let response: Response;
response = await fetch(endpointWithParams);
if (!response.ok) {
throw new Error(`Failed to fetch projects from Gerrit at ${endpointWithParams} with status ${response.status}`);
try {
response = await fetch(endpointWithParams);
if (!response.ok) {
logger.error(`Failed to fetch projects from Gerrit at ${endpointWithParams} with status ${response.status}`);
const e = new BackendException(BackendError.CONNECTION_SYNC_FAILED_TO_FETCH_GERRIT_PROJECTS, {
status: response.status,
});
Sentry.captureException(e);
throw e;
}
} catch (err) {
Sentry.captureException(err);
if (err instanceof BackendException) {
throw err;
}
const status = (err as any).code;
logger.error(`Failed to fetch projects from Gerrit at ${endpointWithParams} with status ${status}`);
throw new BackendException(BackendError.CONNECTION_SYNC_FAILED_TO_FETCH_GERRIT_PROJECTS, {
status: status,
});
}
const text = await response.text();
@ -114,11 +152,11 @@ const shouldExcludeProject = ({
const shouldExclude = (() => {
if ([
'All-Projects',
'All-Users',
'All-Avatars',
'All-Archived-Projects'
].includes(project.name)) {
'All-Projects',
'All-Users',
'All-Avatars',
'All-Archived-Projects'
].includes(project.name)) {
reason = `Project is a special project.`;
return true;
}

View file

@ -1,72 +1,30 @@
import { env } from "@sourcebot/shared";
import { existsSync } from 'node:fs';
import { mkdir } from 'node:fs/promises';
import { dirname, resolve } from 'node:path';
import { CheckRepoActions, GitConfigScope, simpleGit, SimpleGitProgressEvent } from 'simple-git';
import { mkdir } from 'node:fs/promises';
import { env } from './env.js';
type onProgressFn = (event: SimpleGitProgressEvent) => void;
/**
 * Builds a simple-git client whose working directory is `path`.
 *
 * @param path Directory the client operates in; it must already exist.
 * @param onProgress Optional progress callback handed to simple-git.
 * @param signal Optional abort signal handed to simple-git.
 * @throws Error when `path` does not exist.
 */
const createGitClientForPath = (path: string, onProgress?: onProgressFn, signal?: AbortSignal) => {
    const pathExists = existsSync(path);
    if (!pathExists) {
        throw new Error(`Path ${path} does not exist`);
    }

    const ceilingDirectory = resolve(dirname(path));

    const gitEnvironment = {
        ...process.env,
        /**
         * @note why this is necessary: we saw a `git clone` fail without
         * throwing, after which a subsequent `git config` command ran outside
         * of any git directory. Git then walked up the directory tree until it
         * either found a git directory (in the development env) or hit a
         * GIT_DISCOVERY_ACROSS_FILESYSTEM error when crossing a filesystem
         * boundary (in prod). GIT_CEILING_DIRECTORIES limits that walk to the
         * parent directory.
         */
        GIT_CEILING_DIRECTORIES: ceilingDirectory,
        /**
         * Turn off git credential prompts, so git operations fail immediately
         * when credentials are unavailable instead of waiting for input.
         */
        GIT_TERMINAL_PROMPT: '0',
    };

    const baseClient = simpleGit({
        progress: onProgress,
        abort: signal,
    });
    const clientWithEnv = baseClient.env(gitEnvironment);

    return clientWithEnv.cwd({
        path,
    });
}
export const cloneRepository = async (
{
cloneUrl,
authHeader,
path,
onProgress,
signal,
}: {
cloneUrl: string,
authHeader?: string,
path: string,
onProgress?: onProgressFn
signal?: AbortSignal
}
) => {
try {
await mkdir(path, { recursive: true });
const git = createGitClientForPath(path, onProgress, signal);
const git = simpleGit({
progress: onProgress,
}).cwd({
path,
})
const cloneArgs = [
"--bare",
@ -75,11 +33,7 @@ export const cloneRepository = async (
await git.clone(cloneUrl, path, cloneArgs);
await unsetGitConfig({
path,
keys: ["remote.origin.url"],
signal,
});
await unsetGitConfig(path, ["remote.origin.url"]);
} catch (error: unknown) {
const baseLog = `Failed to clone repository: ${path}`;
@ -100,17 +54,20 @@ export const fetchRepository = async (
authHeader,
path,
onProgress,
signal,
}: {
cloneUrl: string,
authHeader?: string,
path: string,
onProgress?: onProgressFn,
signal?: AbortSignal
onProgress?: onProgressFn
}
) => {
const git = createGitClientForPath(path, onProgress, signal);
try {
const git = simpleGit({
progress: onProgress,
}).cwd({
path: path,
})
if (authHeader) {
await git.addConfig("http.extraHeader", authHeader);
}
@ -133,6 +90,12 @@ export const fetchRepository = async (
}
} finally {
if (authHeader) {
const git = simpleGit({
progress: onProgress,
}).cwd({
path: path,
})
await git.raw(["config", "--unset", "http.extraHeader", authHeader]);
}
}
@ -144,19 +107,10 @@ export const fetchRepository = async (
* that do not exist yet. It will _not_ remove any existing keys that are not
* present in gitConfig.
*/
export const upsertGitConfig = async (
{
path,
gitConfig,
onProgress,
signal,
}: {
path: string,
gitConfig: Record<string, string>,
onProgress?: onProgressFn,
signal?: AbortSignal
}) => {
const git = createGitClientForPath(path, onProgress, signal);
export const upsertGitConfig = async (path: string, gitConfig: Record<string, string>, onProgress?: onProgressFn) => {
const git = simpleGit({
progress: onProgress,
}).cwd(path);
try {
for (const [key, value] of Object.entries(gitConfig)) {
@ -175,19 +129,10 @@ export const upsertGitConfig = async (
* Unsets the specified keys in the git config for the repo at the given path.
* If a key is not set, this is a no-op.
*/
export const unsetGitConfig = async (
{
path,
keys,
onProgress,
signal,
}: {
path: string,
keys: string[],
onProgress?: onProgressFn,
signal?: AbortSignal
}) => {
const git = createGitClientForPath(path, onProgress, signal);
export const unsetGitConfig = async (path: string, keys: string[], onProgress?: onProgressFn) => {
const git = simpleGit({
progress: onProgress,
}).cwd(path);
try {
const configList = await git.listConfig();
@ -210,20 +155,10 @@ export const unsetGitConfig = async (
/**
* Returns true if `path` is the _root_ of a git repository.
*/
export const isPathAValidGitRepoRoot = async ({
path,
onProgress,
signal,
}: {
path: string,
onProgress?: onProgressFn,
signal?: AbortSignal
}) => {
if (!existsSync(path)) {
return false;
}
const git = createGitClientForPath(path, onProgress, signal);
export const isPathAValidGitRepoRoot = async (path: string, onProgress?: onProgressFn) => {
const git = simpleGit({
progress: onProgress,
}).cwd(path);
try {
return git.checkIsRepo(CheckRepoActions.IS_REPO_ROOT);
@ -249,7 +184,7 @@ export const isUrlAValidGitRepo = async (url: string) => {
}
export const getOriginUrl = async (path: string) => {
const git = createGitClientForPath(path);
const git = simpleGit().cwd(path);
try {
const remotes = await git.getConfig('remote.origin.url', GitConfigScope.local);
@ -264,35 +199,18 @@ export const getOriginUrl = async (path: string) => {
}
export const getBranches = async (path: string) => {
const git = createGitClientForPath(path);
const branches = await git.branch();
const git = simpleGit();
const branches = await git.cwd({
path,
}).branch();
return branches.all;
}
export const getTags = async (path: string) => {
const git = createGitClientForPath(path);
const tags = await git.tags();
const git = simpleGit();
const tags = await git.cwd({
path,
}).tags();
return tags.all;
}
/**
 * Resolves `refName` to a commit hash in the repository at `path`.
 *
 * Returns `undefined` (after logging the error to the console) when the ref
 * cannot be resolved.
 */
export const getCommitHashForRefName = async (
    { path, refName }: { path: string, refName: string },
) => {
    const client = createGitClientForPath(path);

    // The `^{commit}` suffix is used to fully dereference the ref to a commit hash.
    const revision = `${refName}^{commit}`;

    try {
        return await client.revparse(revision);
    } catch (err: unknown) {
        // @note: Was hitting errors when the repository is empty,
        // so we're catching the error and returning undefined.
        console.error(err);
        return undefined;
    }
}

View file

@ -1,24 +1,24 @@
import * as Sentry from "@sentry/node";
import { getTokenFromConfig } from "@sourcebot/shared";
import { createLogger } from '@sourcebot/shared';
import { Api, giteaApi, HttpResponse, Repository as GiteaRepository } from 'gitea-js';
import { GiteaConnectionConfig } from '@sourcebot/schemas/v3/gitea.type';
import { env } from "@sourcebot/shared";
import { getTokenFromConfig, measure } from './utils.js';
import fetch from 'cross-fetch';
import { Api, giteaApi, Repository as GiteaRepository, HttpResponse } from 'gitea-js';
import { createLogger } from '@sourcebot/logger';
import micromatch from 'micromatch';
import { PrismaClient } from '@sourcebot/db';
import { processPromiseResults, throwIfAnyFailed } from './connectionUtils.js';
import { measure } from './utils.js';
import * as Sentry from "@sentry/node";
import { env } from './env.js';
const logger = createLogger('gitea');
const GITEA_CLOUD_HOSTNAME = "gitea.com";
export const getGiteaReposFromConfig = async (config: GiteaConnectionConfig) => {
export const getGiteaReposFromConfig = async (config: GiteaConnectionConfig, orgId: number, db: PrismaClient) => {
const hostname = config.url ?
new URL(config.url).hostname :
GITEA_CLOUD_HOSTNAME;
const token = config.token ?
await getTokenFromConfig(config.token) :
await getTokenFromConfig(config.token, orgId, db, logger) :
hostname === GITEA_CLOUD_HOSTNAME ?
env.FALLBACK_GITEA_CLOUD_TOKEN :
undefined;
@ -29,30 +29,38 @@ export const getGiteaReposFromConfig = async (config: GiteaConnectionConfig) =>
});
let allRepos: GiteaRepository[] = [];
let allWarnings: string[] = [];
let notFound: {
users: string[],
orgs: string[],
repos: string[],
} = {
users: [],
orgs: [],
repos: [],
};
if (config.orgs) {
const { repos, warnings } = await getReposForOrgs(config.orgs, api);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundOrgs } = await getReposForOrgs(config.orgs, api);
allRepos = allRepos.concat(validRepos);
notFound.orgs = notFoundOrgs;
}
if (config.repos) {
const { repos, warnings } = await getRepos(config.repos, api);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundRepos } = await getRepos(config.repos, api);
allRepos = allRepos.concat(validRepos);
notFound.repos = notFoundRepos;
}
if (config.users) {
const { repos, warnings } = await getReposOwnedByUsers(config.users, api);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundUsers } = await getReposOwnedByUsers(config.users, api);
allRepos = allRepos.concat(validRepos);
notFound.users = notFoundUsers;
}
allRepos = allRepos.filter(repo => repo.full_name !== undefined);
allRepos = allRepos.filter(repo => {
if (repo.full_name === undefined) {
logger.warn(`Repository with undefined full_name found: repoId=${repo.id}`);
logger.warn(`Repository with undefined full_name found: orgId=${orgId}, repoId=${repo.id}`);
return false;
}
return true;
@ -70,8 +78,8 @@ export const getGiteaReposFromConfig = async (config: GiteaConnectionConfig) =>
logger.debug(`Found ${repos.length} total repositories.`);
return {
repos,
warnings: allWarnings,
validRepos: repos,
notFound,
};
}
@ -137,11 +145,10 @@ const getReposOwnedByUsers = async <T>(users: string[], api: Api<T>) => {
Sentry.captureException(e);
if (e?.status === 404) {
const warning = `User ${user} not found or no access`;
logger.warn(warning);
logger.error(`User ${user} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: user
};
}
throw e;
@ -149,11 +156,11 @@ const getReposOwnedByUsers = async <T>(users: string[], api: Api<T>) => {
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<GiteaRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundUsers } = processPromiseResults<GiteaRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundUsers,
};
}
@ -178,11 +185,10 @@ const getReposForOrgs = async <T>(orgs: string[], api: Api<T>) => {
Sentry.captureException(e);
if (e?.status === 404) {
const warning = `Organization ${org} not found or no access`;
logger.warn(warning);
logger.error(`Organization ${org} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: org
};
}
throw e;
@ -190,16 +196,16 @@ const getReposForOrgs = async <T>(orgs: string[], api: Api<T>) => {
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<GiteaRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundOrgs } = processPromiseResults<GiteaRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundOrgs,
};
}
const getRepos = async <T>(repoList: string[], api: Api<T>) => {
const results = await Promise.allSettled(repoList.map(async (repo) => {
const getRepos = async <T>(repos: string[], api: Api<T>) => {
const results = await Promise.allSettled(repos.map(async (repo) => {
try {
logger.debug(`Fetching repository info for ${repo}...`);
@ -217,11 +223,10 @@ const getRepos = async <T>(repoList: string[], api: Api<T>) => {
Sentry.captureException(e);
if (e?.status === 404) {
const warning = `Repository ${repo} not found or no access`;
logger.warn(warning);
logger.error(`Repository ${repo} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
throw e;
@ -229,11 +234,11 @@ const getRepos = async <T>(repoList: string[], api: Api<T>) => {
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<GiteaRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundRepos } = processPromiseResults<GiteaRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundRepos,
};
}

View file

@ -1,21 +1,16 @@
import { Octokit } from "@octokit/rest";
import * as Sentry from "@sentry/node";
import { getTokenFromConfig } from "@sourcebot/shared";
import { createLogger } from "@sourcebot/shared";
import { GithubConnectionConfig } from "@sourcebot/schemas/v3/github.type";
import { env, hasEntitlement } from "@sourcebot/shared";
import { createLogger } from "@sourcebot/logger";
import { getTokenFromConfig, measure, fetchWithRetry } from "./utils.js";
import micromatch from "micromatch";
import pLimit from "p-limit";
import { PrismaClient } from "@sourcebot/db";
import { BackendException, BackendError } from "@sourcebot/error";
import { processPromiseResults, throwIfAnyFailed } from "./connectionUtils.js";
import { GithubAppManager } from "./ee/githubAppManager.js";
import { fetchWithRetry, measure } from "./utils.js";
import * as Sentry from "@sentry/node";
import { env } from "./env.js";
export const GITHUB_CLOUD_HOSTNAME = "github.com";
// Limit concurrent GitHub requests to avoid hitting rate limits and overwhelming installations.
const MAX_CONCURRENT_GITHUB_QUERIES = 5;
const githubQueryLimit = pLimit(MAX_CONCURRENT_GITHUB_QUERIES);
const logger = createLogger('github');
const GITHUB_CLOUD_HOSTNAME = "github.com";
export type OctokitRepository = {
name: string,
@ -47,10 +42,9 @@ const isHttpError = (error: unknown, status: number): boolean => {
}
export const createOctokitFromToken = async ({ token, url }: { token?: string, url?: string }): Promise<{ octokit: Octokit, isAuthenticated: boolean }> => {
const isGitHubCloud = url ? new URL(url).hostname === GITHUB_CLOUD_HOSTNAME : false;
const octokit = new Octokit({
auth: token,
...(url && !isGitHubCloud ? {
...(url ? {
baseUrl: `${url}/api/v3`
} : {}),
});
@ -61,47 +55,13 @@ export const createOctokitFromToken = async ({ token, url }: { token?: string, u
};
}
/**
 * Returns an Octokit instance authenticated through a GitHub App installation
 * token for `owner` when that is possible, and the supplied `octokit` otherwise.
 *
 * The supplied instance is returned when the 'github-app' entitlement is
 * missing, when no apps are configured, when the app-based instance is not
 * authenticated, or when obtaining the token throws.
 *
 * @param octokit Fallback instance.
 * @param owner Owner passed to the installation token lookup.
 * @param url Optional host URL; its hostname is used for the token lookup,
 *   defaulting to GITHUB_CLOUD_HOSTNAME when absent.
 * @param context Label interpolated into the error log messages.
 */
const getOctokitWithGithubApp = async (
    octokit: Octokit,
    owner: string,
    url: string | undefined,
    context: string
): Promise<Octokit> => {
    const canUseGithubApp = hasEntitlement('github-app') && GithubAppManager.getInstance().appsConfigured();
    if (!canUseGithubApp) {
        return octokit;
    }

    try {
        const hostname = url ? new URL(url).hostname : GITHUB_CLOUD_HOSTNAME;
        const token = await GithubAppManager.getInstance().getInstallationToken(owner, hostname);

        const appClient = await createOctokitFromToken({
            token,
            url,
        });

        if (!appClient.isAuthenticated) {
            logger.error(`Failed to authenticate with GitHub App for ${context}. Falling back to legacy token resolution.`);
            return octokit;
        }

        return appClient.octokit;
    } catch (error) {
        logger.error(`Error getting GitHub App token for ${context}. Falling back to legacy token resolution.`, error);
        return octokit;
    }
}
export const getGitHubReposFromConfig = async (config: GithubConnectionConfig, signal: AbortSignal): Promise<{ repos: OctokitRepository[], warnings: string[] }> => {
export const getGitHubReposFromConfig = async (config: GithubConnectionConfig, orgId: number, db: PrismaClient, signal: AbortSignal) => {
const hostname = config.url ?
new URL(config.url).hostname :
GITHUB_CLOUD_HOSTNAME;
const token = config.token ?
await getTokenFromConfig(config.token) :
await getTokenFromConfig(config.token, orgId, db, logger) :
hostname === GITHUB_CLOUD_HOSTNAME ?
env.FALLBACK_GITHUB_CLOUD_TOKEN :
undefined;
@ -111,36 +71,57 @@ export const getGitHubReposFromConfig = async (config: GithubConnectionConfig, s
url: config.url,
});
if (isAuthenticated) {
try {
await octokit.rest.users.getAuthenticated();
} catch (error) {
Sentry.captureException(error);
logger.error(`Failed to authenticate with GitHub`, error);
throw error;
if (isHttpError(error, 401)) {
const e = new BackendException(BackendError.CONNECTION_SYNC_INVALID_TOKEN, {
...(config.token && 'secret' in config.token ? {
secretKey: config.token.secret,
} : {}),
});
Sentry.captureException(e);
throw e;
}
const e = new BackendException(BackendError.CONNECTION_SYNC_SYSTEM_ERROR, {
message: `Failed to authenticate with GitHub`,
});
Sentry.captureException(e);
throw e;
}
}
let allRepos: OctokitRepository[] = [];
let allWarnings: string[] = [];
let notFound: {
users: string[],
orgs: string[],
repos: string[],
} = {
users: [],
orgs: [],
repos: [],
};
if (config.orgs) {
const { repos, warnings } = await getReposForOrgs(config.orgs, octokit, signal, config.url);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundOrgs } = await getReposForOrgs(config.orgs, octokit, signal);
allRepos = allRepos.concat(validRepos);
notFound.orgs = notFoundOrgs;
}
if (config.repos) {
const { repos, warnings } = await getRepos(config.repos, octokit, signal, config.url);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundRepos } = await getRepos(config.repos, octokit, signal);
allRepos = allRepos.concat(validRepos);
notFound.repos = notFoundRepos;
}
if (config.users) {
const { repos, warnings } = await getReposOwnedByUsers(config.users, octokit, signal, config.url);
allRepos = allRepos.concat(repos);
allWarnings = allWarnings.concat(warnings);
const { validRepos, notFoundUsers } = await getReposOwnedByUsers(config.users, octokit, signal);
allRepos = allRepos.concat(validRepos);
notFound.users = notFoundUsers;
}
let repos = allRepos
@ -159,8 +140,8 @@ export const getGitHubReposFromConfig = async (config: GithubConnectionConfig, s
logger.debug(`Found ${repos.length} total repositories.`);
return {
repos,
warnings: allWarnings,
validRepos: repos,
notFound,
};
}
@ -197,12 +178,11 @@ export const getReposForAuthenticatedUser = async (visibility: 'all' | 'private'
}
}
const getReposOwnedByUsers = async (users: string[], octokit: Octokit, signal: AbortSignal, url?: string) => {
const results = await Promise.allSettled(users.map((user) => githubQueryLimit(async () => {
const getReposOwnedByUsers = async (users: string[], octokit: Octokit, signal: AbortSignal) => {
const results = await Promise.allSettled(users.map(async (user) => {
try {
logger.debug(`Fetching repository info for user ${user}...`);
const octokitToUse = await getOctokitWithGithubApp(octokit, user, url, `user ${user}`);
const { durationMs, data } = await measure(async () => {
const fetchFn = async () => {
let query = `user:${user}`;
@ -214,7 +194,7 @@ const getReposOwnedByUsers = async (users: string[], octokit: Octokit, signal: A
// the username as a parameter.
// @see: https://github.com/orgs/community/discussions/24382#discussioncomment-3243958
// @see: https://api.github.com/search/repositories?q=user:USERNAME
const searchResults = await octokitToUse.paginate(octokitToUse.rest.search.repos, {
const searchResults = await octokit.paginate(octokit.rest.search.repos, {
q: query,
per_page: 100,
request: {
@ -238,34 +218,32 @@ const getReposOwnedByUsers = async (users: string[], octokit: Octokit, signal: A
logger.error(`Failed to fetch repositories for user ${user}.`, error);
if (isHttpError(error, 404)) {
const warning = `User ${user} not found or no access`;
logger.warn(warning);
logger.error(`User ${user} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: user
};
}
throw error;
}
})));
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<OctokitRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundUsers } = processPromiseResults<OctokitRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundUsers,
};
}
const getReposForOrgs = async (orgs: string[], octokit: Octokit, signal: AbortSignal, url?: string) => {
const results = await Promise.allSettled(orgs.map((org) => githubQueryLimit(async () => {
const getReposForOrgs = async (orgs: string[], octokit: Octokit, signal: AbortSignal) => {
const results = await Promise.allSettled(orgs.map(async (org) => {
try {
logger.debug(`Fetching repository info for org ${org}...`);
logger.info(`Fetching repository info for org ${org}...`);
const octokitToUse = await getOctokitWithGithubApp(octokit, org, url, `org ${org}`);
const { durationMs, data } = await measure(async () => {
const fetchFn = () => octokitToUse.paginate(octokitToUse.repos.listForOrg, {
const fetchFn = () => octokit.paginate(octokit.repos.listForOrg, {
org: org,
per_page: 100,
request: {
@ -276,7 +254,7 @@ const getReposForOrgs = async (orgs: string[], octokit: Octokit, signal: AbortSi
return fetchWithRetry(fetchFn, `org ${org}`, logger);
});
logger.debug(`Found ${data.length} in org ${org} in ${durationMs}ms.`);
logger.info(`Found ${data.length} in org ${org} in ${durationMs}ms.`);
return {
type: 'valid' as const,
data
@ -286,35 +264,33 @@ const getReposForOrgs = async (orgs: string[], octokit: Octokit, signal: AbortSi
logger.error(`Failed to fetch repositories for org ${org}.`, error);
if (isHttpError(error, 404)) {
const warning = `Organization ${org} not found or no access`;
logger.warn(warning);
logger.error(`Organization ${org} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: org
};
}
throw error;
}
})));
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<OctokitRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundOrgs } = processPromiseResults<OctokitRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundOrgs,
};
}
const getRepos = async (repoList: string[], octokit: Octokit, signal: AbortSignal, url?: string) => {
const results = await Promise.allSettled(repoList.map((repo) => githubQueryLimit(async () => {
const getRepos = async (repoList: string[], octokit: Octokit, signal: AbortSignal) => {
const results = await Promise.allSettled(repoList.map(async (repo) => {
try {
const [owner, repoName] = repo.split('/');
logger.debug(`Fetching repository info for ${repo}...`);
logger.info(`Fetching repository info for ${repo}...`);
const octokitToUse = await getOctokitWithGithubApp(octokit, owner, url, `repo ${repo}`);
const { durationMs, data: result } = await measure(async () => {
const fetchFn = () => octokitToUse.repos.get({
const fetchFn = () => octokit.repos.get({
owner,
repo: repoName,
request: {
@ -325,7 +301,7 @@ const getRepos = async (repoList: string[], octokit: Octokit, signal: AbortSigna
return fetchWithRetry(fetchFn, repo, logger);
});
logger.debug(`Found info for repository ${repo} in ${durationMs}ms`);
logger.info(`Found info for repository ${repo} in ${durationMs}ms`);
return {
type: 'valid' as const,
data: [result.data]
@ -336,23 +312,22 @@ const getRepos = async (repoList: string[], octokit: Octokit, signal: AbortSigna
logger.error(`Failed to fetch repository ${repo}.`, error);
if (isHttpError(error, 404)) {
const warning = `Repository ${repo} not found or no access`;
logger.warn(warning);
logger.error(`Repository ${repo} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: repo
};
}
throw error;
}
})));
}));
throwIfAnyFailed(results);
const { validItems: repos, warnings } = processPromiseResults<OctokitRepository>(results);
const { validItems: validRepos, notFoundItems: notFoundRepos } = processPromiseResults<OctokitRepository>(results);
return {
repos,
warnings,
validRepos,
notFoundRepos,
};
}

View file

@ -1,56 +1,47 @@
import { Gitlab, ProjectSchema } from "@gitbeaker/rest";
import * as Sentry from "@sentry/node";
import { getTokenFromConfig } from "@sourcebot/shared";
import { createLogger } from "@sourcebot/shared";
import { GitlabConnectionConfig } from "@sourcebot/schemas/v3/gitlab.type";
import { env } from "@sourcebot/shared";
import micromatch from "micromatch";
import { createLogger } from "@sourcebot/logger";
import { GitlabConnectionConfig } from "@sourcebot/schemas/v3/gitlab.type"
import { getTokenFromConfig, measure, fetchWithRetry } from "./utils.js";
import { PrismaClient } from "@sourcebot/db";
import { processPromiseResults, throwIfAnyFailed } from "./connectionUtils.js";
import { fetchWithRetry, measure } from "./utils.js";
import * as Sentry from "@sentry/node";
import { env } from "./env.js";
const logger = createLogger('gitlab');
export const GITLAB_CLOUD_HOSTNAME = "gitlab.com";
/**
 * Creates a GitLab API client that authenticates with a personal access token.
 *
 * @param token - Personal access token handed to the client; may be undefined.
 * @param url - Optional base URL of the GitLab instance.
 * @returns A new `Gitlab` client instance.
 * @throws TypeError if `url` is a non-empty string that `new URL` cannot parse.
 */
export const createGitLabFromPersonalAccessToken = async ({ token, url }: { token?: string, url?: string }) => {
    // A missing or empty URL is never treated as GitLab cloud.
    let targetsGitLabCloud = false;
    if (url) {
        targetsGitLabCloud = new URL(url).hostname === GITLAB_CLOUD_HOSTNAME;
    }

    // The explicit host is only passed for non-cloud targets; for the cloud
    // hostname the client's own default host is used.
    const hostOption = targetsGitLabCloud ? {} : { host: url };

    return new Gitlab({
        token,
        ...hostOption,
        // Env value is expressed in seconds; scaled by 1000 for the client
        // (presumably milliseconds; confirm against the client's docs).
        queryTimeout: env.GITLAB_CLIENT_QUERY_TIMEOUT_SECONDS * 1000,
    });
}
/**
 * Creates a GitLab API client that authenticates with an OAuth token.
 *
 * @param oauthToken - OAuth token handed to the client; may be undefined.
 * @param url - Optional base URL of the GitLab instance.
 * @returns A new `Gitlab` client instance.
 * @throws TypeError if `url` is a non-empty string that `new URL` cannot parse.
 */
export const createGitLabFromOAuthToken = async ({ oauthToken, url }: { oauthToken?: string, url?: string }) => {
    // A missing or empty URL is never treated as GitLab cloud.
    let targetsGitLabCloud = false;
    if (url) {
        targetsGitLabCloud = new URL(url).hostname === GITLAB_CLOUD_HOSTNAME;
    }

    // Only non-cloud targets get an explicit host option.
    const hostOption = targetsGitLabCloud ? {} : { host: url };

    return new Gitlab({
        oauthToken,
        ...hostOption,
        // Env value is expressed in seconds; scaled by 1000 for the client
        // (presumably milliseconds; confirm against the client's docs).
        queryTimeout: env.GITLAB_CLIENT_QUERY_TIMEOUT_SECONDS * 1000,
    });
}
export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) => {
export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig, orgId: number, db: PrismaClient) => {
const hostname = config.url ?
new URL(config.url).hostname :
GITLAB_CLOUD_HOSTNAME;
const token = config.token ?
await getTokenFromConfig(config.token) :
await getTokenFromConfig(config.token, orgId, db, logger) :
hostname === GITLAB_CLOUD_HOSTNAME ?
env.FALLBACK_GITLAB_CLOUD_TOKEN :
undefined;
const api = await createGitLabFromPersonalAccessToken({
token,
url: config.url,
const api = new Gitlab({
...(token ? {
token,
} : {}),
...(config.url ? {
host: config.url,
} : {}),
queryTimeout: env.GITLAB_CLIENT_QUERY_TIMEOUT_SECONDS * 1000,
});
let allRepos: ProjectSchema[] = [];
let allWarnings: string[] = [];
let notFound: {
orgs: string[],
users: string[],
repos: string[],
} = {
orgs: [],
users: [],
repos: [],
};
if (config.all === true) {
if (hostname !== GITLAB_CLOUD_HOSTNAME) {
@ -70,9 +61,7 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
throw e;
}
} else {
const warning = `Ignoring option all:true in config : host is ${GITLAB_CLOUD_HOSTNAME}`;
logger.warn(warning);
allWarnings = allWarnings.concat(warning);
logger.warn(`Ignoring option all:true in config : host is ${GITLAB_CLOUD_HOSTNAME}`);
}
}
@ -98,11 +87,10 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
const status = e?.cause?.response?.status;
if (status === 404) {
const warning = `Group ${group} not found or no access`;
logger.warn(warning);
logger.error(`Group ${group} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: group
};
}
throw e;
@ -110,9 +98,9 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
}));
throwIfAnyFailed(results);
const { validItems: validRepos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundOrgs } = processPromiseResults(results);
allRepos = allRepos.concat(validRepos);
allWarnings = allWarnings.concat(warnings);
notFound.orgs = notFoundOrgs;
}
if (config.users) {
@ -136,11 +124,10 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
const status = e?.cause?.response?.status;
if (status === 404) {
const warning = `User ${user} not found or no access`;
logger.warn(warning);
logger.error(`User ${user} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: user
};
}
throw e;
@ -148,9 +135,9 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
}));
throwIfAnyFailed(results);
const { validItems: validRepos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundUsers } = processPromiseResults(results);
allRepos = allRepos.concat(validRepos);
allWarnings = allWarnings.concat(warnings);
notFound.users = notFoundUsers;
}
if (config.projects) {
@ -173,11 +160,10 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
const status = e?.cause?.response?.status;
if (status === 404) {
const warning = `Project ${project} not found or no access`;
logger.warn(warning);
logger.error(`Project ${project} not found or no access`);
return {
type: 'warning' as const,
warning
type: 'notFound' as const,
value: project
};
}
throw e;
@ -185,9 +171,9 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
}));
throwIfAnyFailed(results);
const { validItems: validRepos, warnings } = processPromiseResults(results);
const { validItems: validRepos, notFoundItems: notFoundRepos } = processPromiseResults(results);
allRepos = allRepos.concat(validRepos);
allWarnings = allWarnings.concat(warnings);
notFound.repos = notFoundRepos;
}
let repos = allRepos
@ -206,8 +192,8 @@ export const getGitLabReposFromConfig = async (config: GitlabConnectionConfig) =
logger.debug(`Found ${repos.length} total repositories.`);
return {
repos,
warnings: allWarnings,
validRepos: repos,
notFound,
};
}
@ -277,38 +263,4 @@ export const shouldExcludeProject = ({
}
return false;
}
/**
 * Fetches the members of a GitLab project, including inherited members.
 *
 * @param projectId - Identifier of the project whose members are listed.
 * @param api - GitLab client used to issue the request.
 * @returns The member list, cast to objects exposing a numeric `id`.
 * @throws Re-throws any error from the fetch after reporting it to Sentry and the logger.
 */
export const getProjectMembers = async (projectId: string, api: InstanceType<typeof Gitlab>) => {
    // Request thunk handed to fetchWithRetry, which decides when to invoke it.
    const listMembers = () => api.ProjectMembers.all(projectId, {
        perPage: 100,
        includeInherited: true,
    });

    try {
        const result = await fetchWithRetry(listMembers, `project ${projectId}`, logger);
        return result as Array<{ id: number }>;
    } catch (error) {
        Sentry.captureException(error);
        logger.error(`Failed to fetch members for project ${projectId}.`, error);
        throw error;
    }
}
/**
 * Lists the projects the authenticated user is a member of.
 *
 * @param visibility - Visibility filter; 'all' (the default) sends no visibility filter.
 * @param api - GitLab client used to issue the request.
 * @returns Whatever `fetchWithRetry` resolves with for the project listing.
 * @throws Re-throws any error from the fetch after reporting it to Sentry and the logger.
 */
export const getProjectsForAuthenticatedUser = async (visibility: 'private' | 'internal' | 'public' | 'all' = 'all', api: InstanceType<typeof Gitlab>) => {
    // Request thunk handed to fetchWithRetry.
    const listProjects = () => api.Projects.all({
        membership: true,
        // 'all' means "do not filter", so the key is left out entirely.
        ...(visibility === 'all' ? {} : {
            visibility,
        }),
        perPage: 100,
    });

    try {
        // `await` is required here so that rejections are handled by the catch below.
        return await fetchWithRetry(listProjects, `authenticated user`, logger);
    } catch (error) {
        Sentry.captureException(error);
        logger.error(`Failed to fetch projects for authenticated user.`, error);
        throw error;
    }
}

View file

@ -1,28 +1,41 @@
import "./instrument.js";
import * as Sentry from "@sentry/node";
import { PrismaClient } from "@sourcebot/db";
import { createLogger, env, getConfigSettings, getDBConnectionString, hasEntitlement } from "@sourcebot/shared";
import 'express-async-errors';
import { createLogger } from "@sourcebot/logger";
import { hasEntitlement, loadConfig } from '@sourcebot/shared';
import { existsSync } from 'fs';
import { mkdir } from 'fs/promises';
import { Redis } from 'ioredis';
import { Api } from "./api.js";
import { ConfigManager } from "./configManager.js";
import path from 'path';
import { ConnectionManager } from './connectionManager.js';
import { INDEX_CACHE_DIR, REPOS_CACHE_DIR, SHUTDOWN_SIGNALS } from './constants.js';
import { AccountPermissionSyncer } from "./ee/accountPermissionSyncer.js";
import { GithubAppManager } from "./ee/githubAppManager.js";
import { DEFAULT_SETTINGS } from './constants.js';
import { env } from "./env.js";
import { RepoPermissionSyncer } from './ee/repoPermissionSyncer.js';
import { shutdownPosthog } from "./posthog.js";
import { PromClient } from './promClient.js';
import { RepoIndexManager } from "./repoIndexManager.js";
import { RepoManager } from './repoManager.js';
import { AppContext } from "./types.js";
import { UserPermissionSyncer } from "./ee/userPermissionSyncer.js";
const logger = createLogger('backend-entrypoint');
const reposPath = REPOS_CACHE_DIR;
const indexPath = INDEX_CACHE_DIR;
/**
 * Resolves the effective settings.
 *
 * @param configPath - Optional path to a config file. When absent or empty,
 *   DEFAULT_SETTINGS is returned as-is.
 * @returns DEFAULT_SETTINGS, or a new object in which the loaded config's
 *   `settings` override the defaults key by key.
 */
const getSettings = async (configPath?: string) => {
    if (configPath) {
        const loaded = await loadConfig(configPath);
        // Later spread wins: values from the config file override the defaults.
        return {
            ...DEFAULT_SETTINGS,
            ...loaded.settings,
        };
    }

    return DEFAULT_SETTINGS;
}
const cacheDir = env.DATA_CACHE_DIR;
const reposPath = path.join(cacheDir, 'repos');
const indexPath = path.join(cacheDir, 'index');
if (!existsSync(reposPath)) {
await mkdir(reposPath, { recursive: true });
@ -31,42 +44,38 @@ if (!existsSync(indexPath)) {
await mkdir(indexPath, { recursive: true });
}
const prisma = new PrismaClient({
datasources: {
db: {
url: getDBConnectionString(),
},
},
});
const context: AppContext = {
indexPath,
reposPath,
cachePath: cacheDir,
}
const prisma = new PrismaClient();
const redis = new Redis(env.REDIS_URL, {
maxRetriesPerRequest: null
});
try {
await redis.ping();
redis.ping().then(() => {
logger.info('Connected to redis');
} catch (err: unknown) {
logger.error('Failed to connect to redis. Error:', err);
}).catch((err: unknown) => {
logger.error('Failed to connect to redis');
logger.error(err);
process.exit(1);
}
});
const promClient = new PromClient();
const settings = await getConfigSettings(env.CONFIG_PATH);
const settings = await getSettings(env.CONFIG_PATH);
if (hasEntitlement('github-app')) {
await GithubAppManager.getInstance().init(prisma);
}
const connectionManager = new ConnectionManager(prisma, settings, redis, promClient);
const connectionManager = new ConnectionManager(prisma, settings, redis);
const repoManager = new RepoManager(prisma, settings, redis, promClient, context);
const repoPermissionSyncer = new RepoPermissionSyncer(prisma, settings, redis);
const accountPermissionSyncer = new AccountPermissionSyncer(prisma, settings, redis);
const repoIndexManager = new RepoIndexManager(prisma, settings, redis, promClient);
const configManager = new ConfigManager(prisma, connectionManager, env.CONFIG_PATH);
const userPermissionSyncer = new UserPermissionSyncer(prisma, settings, redis);
await repoManager.validateIndexedReposHaveShards();
connectionManager.startScheduler();
repoIndexManager.startScheduler();
repoManager.startScheduler();
if (env.EXPERIMENT_EE_PERMISSION_SYNC_ENABLED === 'true' && !hasEntitlement('permission-syncing')) {
logger.error('Permission syncing is not supported in current plan. Please contact team@sourcebot.dev for assistance.');
@ -74,77 +83,31 @@ if (env.EXPERIMENT_EE_PERMISSION_SYNC_ENABLED === 'true' && !hasEntitlement('per
}
else if (env.EXPERIMENT_EE_PERMISSION_SYNC_ENABLED === 'true' && hasEntitlement('permission-syncing')) {
repoPermissionSyncer.startScheduler();
accountPermissionSyncer.startScheduler();
userPermissionSyncer.startScheduler();
}
const api = new Api(
promClient,
prisma,
connectionManager,
repoIndexManager,
);
logger.info('Worker started.');
const listenToShutdownSignals = () => {
const signals = SHUTDOWN_SIGNALS;
let receivedSignal = false;
const cleanup = async (signal: string) => {
try {
if (receivedSignal) {
return;
}
receivedSignal = true;
logger.info(`Received ${signal}, cleaning up...`);
await repoIndexManager.dispose()
await connectionManager.dispose()
await repoPermissionSyncer.dispose()
await accountPermissionSyncer.dispose()
await configManager.dispose()
await prisma.$disconnect();
await redis.quit();
await api.dispose();
await shutdownPosthog();
logger.info('All workers shut down gracefully');
signals.forEach(sig => process.removeListener(sig, cleanup));
return 0;
} catch (error) {
Sentry.captureException(error);
logger.error('Error shutting down worker:', error);
return 1;
}
}
signals.forEach(signal => {
process.on(signal, (err) => {
cleanup(err).then(code => {
process.exit(code);
});
});
});
// Register handlers for uncaught exceptions and unhandled rejections
process.on('uncaughtException', (err) => {
logger.error(`Uncaught exception: ${err.message}`);
cleanup('uncaughtException').then(() => {
process.exit(1);
});
});
process.on('unhandledRejection', (reason, promise) => {
logger.error(`Unhandled rejection at: ${promise}, reason: ${reason}`);
cleanup('unhandledRejection').then(() => {
process.exit(1);
});
});
/**
 * Releases the worker's resources before the process exits.
 *
 * Disposes the managers/syncers, then disconnects Prisma and closes Redis.
 *
 * @param signal - Name of the signal or event that triggered the cleanup; used for logging only.
 * @throws Propagates a rejection from `prisma.$disconnect()` or `redis.quit()`;
 *   Redis shutdown is still attempted when the Prisma disconnect fails.
 */
const cleanup = async (signal: string) => {
    logger.info(`Received ${signal}, cleaning up...`);

    // NOTE(review): the dispose() results are not awaited here; confirm these
    // calls are synchronous (or safe to fire-and-forget) in this version.
    connectionManager.dispose();
    repoManager.dispose();
    repoPermissionSyncer.dispose();
    userPermissionSyncer.dispose();

    try {
        await prisma.$disconnect();
    } finally {
        // Always attempt to close Redis, even if the Prisma disconnect rejected,
        // so a database error does not leave the Redis connection open.
        await redis.quit();
    }
}
listenToShutdownSignals();
process.on('SIGINT', () => cleanup('SIGINT').finally(() => process.exit(0)));
process.on('SIGTERM', () => cleanup('SIGTERM').finally(() => process.exit(0)));
// Register handlers for uncaught exceptions and unhandled rejections
process.on('uncaughtException', (err) => {
logger.error(`Uncaught exception: ${err.message}`);
cleanup('uncaughtException').finally(() => process.exit(1));
});
process.on('unhandledRejection', (reason, promise) => {
logger.error(`Unhandled rejection at: ${promise}, reason: ${reason}`);
cleanup('unhandledRejection').finally(() => process.exit(1));
});

View file

@ -1,6 +1,6 @@
import * as Sentry from "@sentry/node";
import { createLogger } from "@sourcebot/shared";
import { env } from "@sourcebot/shared/client";
import { env } from "./env.js";
import { createLogger } from "@sourcebot/logger";
const logger = createLogger('instrument');

View file

@ -1,13 +1,12 @@
import { env as clientEnv } from "@sourcebot/shared/client";
import { env } from "@sourcebot/shared";
import { PostHog } from 'posthog-node';
import { PosthogEvent, PosthogEventMap } from './posthogEvents.js';
import { env } from './env.js';
let posthog: PostHog | undefined = undefined;
if (env.POSTHOG_PAPIK) {
if (env.NEXT_PUBLIC_POSTHOG_PAPIK) {
posthog = new PostHog(
env.POSTHOG_PAPIK,
env.NEXT_PUBLIC_POSTHOG_PAPIK,
{
host: "https://us.i.posthog.com",
}
@ -24,11 +23,9 @@ export function captureEvent<E extends PosthogEvent>(event: E, properties: Posth
event: event,
properties: {
...properties,
sourcebot_version: clientEnv.NEXT_PUBLIC_SOURCEBOT_VERSION,
sourcebot_version: env.NEXT_PUBLIC_SOURCEBOT_VERSION,
},
});
}
/**
 * Shuts down the module-level PostHog client.
 *
 * Does nothing when no client was created (i.e. `posthog` is still undefined).
 */
export async function shutdownPosthog() {
    if (!posthog) {
        return;
    }

    await posthog.shutdown();
}
await posthog?.shutdown();

View file

@ -1,94 +1,109 @@
import express, { Request, Response } from 'express';
import client, { Registry, Counter, Gauge } from 'prom-client';
import { createLogger } from "@sourcebot/logger";
const logger = createLogger('prometheus-client');
export class PromClient {
public registry: Registry;
private registry: Registry;
private app: express.Application;
public activeRepoIndexingJobs: Gauge<string>;
public pendingRepoIndexingJobs: Gauge<string>;
public repoIndexingReattemptsTotal: Counter<string>;
public repoIndexingFailTotal: Counter<string>;
public repoIndexingSuccessTotal: Counter<string>;
public activeRepoIndexJobs: Gauge<string>;
public pendingRepoIndexJobs: Gauge<string>;
public repoIndexJobReattemptsTotal: Counter<string>;
public repoIndexJobFailTotal: Counter<string>;
public repoIndexJobSuccessTotal: Counter<string>;
public activeRepoGarbageCollectionJobs: Gauge<string>;
public repoGarbageCollectionErrorTotal: Counter<string>;
public repoGarbageCollectionFailTotal: Counter<string>;
public repoGarbageCollectionSuccessTotal: Counter<string>;
public activeConnectionSyncJobs: Gauge<string>;
public pendingConnectionSyncJobs: Gauge<string>;
public connectionSyncJobReattemptsTotal: Counter<string>;
public connectionSyncJobFailTotal: Counter<string>;
public connectionSyncJobSuccessTotal: Counter<string>;
public readonly PORT = 3060;
constructor() {
this.registry = new Registry();
this.activeRepoIndexJobs = new Gauge({
name: 'active_repo_index_jobs',
help: 'The number of repo jobs in progress',
labelNames: ['repo', 'type'],
this.activeRepoIndexingJobs = new Gauge({
name: 'active_repo_indexing_jobs',
help: 'The number of repo indexing jobs in progress',
labelNames: ['repo'],
});
this.registry.registerMetric(this.activeRepoIndexJobs);
this.registry.registerMetric(this.activeRepoIndexingJobs);
this.pendingRepoIndexJobs = new Gauge({
name: 'pending_repo_index_jobs',
help: 'The number of repo jobs waiting in queue',
labelNames: ['repo', 'type'],
this.pendingRepoIndexingJobs = new Gauge({
name: 'pending_repo_indexing_jobs',
help: 'The number of repo indexing jobs waiting in queue',
labelNames: ['repo'],
});
this.registry.registerMetric(this.pendingRepoIndexJobs);
this.registry.registerMetric(this.pendingRepoIndexingJobs);
this.repoIndexJobReattemptsTotal = new Counter({
name: 'repo_index_job_reattempts',
help: 'The number of repo job reattempts',
labelNames: ['repo', 'type'],
this.repoIndexingReattemptsTotal = new Counter({
name: 'repo_indexing_reattempts',
help: 'The number of repo indexing reattempts',
labelNames: ['repo'],
});
this.registry.registerMetric(this.repoIndexJobReattemptsTotal);
this.registry.registerMetric(this.repoIndexingReattemptsTotal);
this.repoIndexJobFailTotal = new Counter({
name: 'repo_index_job_fails',
help: 'The number of repo job fails',
labelNames: ['repo', 'type'],
this.repoIndexingFailTotal = new Counter({
name: 'repo_indexing_fails',
help: 'The number of repo indexing fails',
labelNames: ['repo'],
});
this.registry.registerMetric(this.repoIndexJobFailTotal);
this.registry.registerMetric(this.repoIndexingFailTotal);
this.repoIndexJobSuccessTotal = new Counter({
name: 'repo_index_job_successes',
help: 'The number of repo job successes',
labelNames: ['repo', 'type'],
this.repoIndexingSuccessTotal = new Counter({
name: 'repo_indexing_successes',
help: 'The number of repo indexing successes',
labelNames: ['repo'],
});
this.registry.registerMetric(this.repoIndexJobSuccessTotal);
this.registry.registerMetric(this.repoIndexingSuccessTotal);
this.activeConnectionSyncJobs = new Gauge({
name: 'active_connection_sync_jobs',
help: 'The number of connection sync jobs in progress',
labelNames: ['connection'],
this.activeRepoGarbageCollectionJobs = new Gauge({
name: 'active_repo_garbage_collection_jobs',
help: 'The number of repo garbage collection jobs in progress',
labelNames: ['repo'],
});
this.registry.registerMetric(this.activeConnectionSyncJobs);
this.registry.registerMetric(this.activeRepoGarbageCollectionJobs);
this.pendingConnectionSyncJobs = new Gauge({
name: 'pending_connection_sync_jobs',
help: 'The number of connection sync jobs waiting in queue',
labelNames: ['connection'],
this.repoGarbageCollectionErrorTotal = new Counter({
name: 'repo_garbage_collection_errors',
help: 'The number of repo garbage collection errors',
labelNames: ['repo'],
});
this.registry.registerMetric(this.pendingConnectionSyncJobs);
this.registry.registerMetric(this.repoGarbageCollectionErrorTotal);
this.connectionSyncJobReattemptsTotal = new Counter({
name: 'connection_sync_job_reattempts',
help: 'The number of connection sync job reattempts',
labelNames: ['connection'],
this.repoGarbageCollectionFailTotal = new Counter({
name: 'repo_garbage_collection_fails',
help: 'The number of repo garbage collection fails',
labelNames: ['repo'],
});
this.registry.registerMetric(this.connectionSyncJobReattemptsTotal);
this.registry.registerMetric(this.repoGarbageCollectionFailTotal);
this.connectionSyncJobFailTotal = new Counter({
name: 'connection_sync_job_fails',
help: 'The number of connection sync job fails',
labelNames: ['connection'],
this.repoGarbageCollectionSuccessTotal = new Counter({
name: 'repo_garbage_collection_successes',
help: 'The number of repo garbage collection successes',
labelNames: ['repo'],
});
this.registry.registerMetric(this.connectionSyncJobFailTotal);
this.connectionSyncJobSuccessTotal = new Counter({
name: 'connection_sync_job_successes',
help: 'The number of connection sync job successes',
labelNames: ['connection'],
});
this.registry.registerMetric(this.connectionSyncJobSuccessTotal);
this.registry.registerMetric(this.repoGarbageCollectionSuccessTotal);
client.collectDefaultMetrics({
register: this.registry,
});
this.app = express();
this.app.get('/metrics', async (req: Request, res: Response) => {
res.set('Content-Type', this.registry.contentType);
const metrics = await this.registry.metrics();
res.end(metrics);
});
this.app.listen(this.PORT, () => {
logger.info(`Prometheus metrics server is running on port ${this.PORT}`);
});
}
getRegistry(): Registry {
return this.registry;
}
}

View file

@ -7,42 +7,39 @@ import { BitbucketRepository, getBitbucketReposFromConfig } from "./bitbucket.js
import { getAzureDevOpsReposFromConfig } from "./azuredevops.js";
import { SchemaRestRepository as BitbucketServerRepository } from "@coderabbitai/bitbucket/server/openapi";
import { SchemaRepository as BitbucketCloudRepository } from "@coderabbitai/bitbucket/cloud/openapi";
import { CodeHostType, Prisma } from '@sourcebot/db';
import { Prisma, PrismaClient } from '@sourcebot/db';
import { WithRequired } from "./types.js"
import { marshalBool } from "./utils.js";
import { createLogger } from '@sourcebot/shared';
import { createLogger } from '@sourcebot/logger';
import { BitbucketConnectionConfig, GerritConnectionConfig, GiteaConnectionConfig, GitlabConnectionConfig, GenericGitHostConnectionConfig, AzureDevOpsConnectionConfig } from '@sourcebot/schemas/v3/connection.type';
import { ProjectVisibility } from "azure-devops-node-api/interfaces/CoreInterfaces.js";
import { RepoMetadata } from './types.js';
import path from 'path';
import { glob } from 'glob';
import { getOriginUrl, isPathAValidGitRepoRoot, isUrlAValidGitRepo } from './git.js';
import assert from 'assert';
import GitUrlParse from 'git-url-parse';
import { RepoMetadata } from '@sourcebot/shared';
import { SINGLE_TENANT_ORG_ID } from './constants.js';
import pLimit from 'p-limit';
export type RepoData = WithRequired<Prisma.RepoCreateInput, 'connections'>;
const logger = createLogger('repo-compile-utils');
// Limit concurrent git operations to prevent resource exhaustion (EAGAIN errors)
// when processing thousands of repositories simultaneously
const MAX_CONCURRENT_GIT_OPERATIONS = 100;
const gitOperationLimit = pLimit(MAX_CONCURRENT_GIT_OPERATIONS);
// Result shape used as the declared return type of the compile*Config functions in this file.
type CompileResult = {
// Repo records compiled from the connection config.
repoData: RepoData[],
// Warning messages collected while compiling; an empty array when there are none.
warnings: string[],
}
export const compileGithubConfig = async (
config: GithubConnectionConfig,
connectionId: number,
signal: AbortSignal): Promise<CompileResult> => {
const gitHubReposResult = await getGitHubReposFromConfig(config, signal);
const gitHubRepos = gitHubReposResult.repos;
const warnings = gitHubReposResult.warnings;
orgId: number,
db: PrismaClient,
abortController: AbortController): Promise<{
repoData: RepoData[],
notFound: {
users: string[],
orgs: string[],
repos: string[],
}
}> => {
const gitHubReposResult = await getGitHubReposFromConfig(config, orgId, db, abortController.signal);
const gitHubRepos = gitHubReposResult.validRepos;
const notFound = gitHubReposResult.notFound;
const hostUrl = config.url ?? 'https://github.com';
const repoNameRoot = new URL(hostUrl)
@ -71,7 +68,7 @@ export const compileGithubConfig = async (
isPublic: isPublic,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -103,17 +100,19 @@ export const compileGithubConfig = async (
return {
repoData: repos,
warnings,
notFound,
};
}
export const compileGitlabConfig = async (
config: GitlabConnectionConfig,
connectionId: number): Promise<CompileResult> => {
connectionId: number,
orgId: number,
db: PrismaClient) => {
const gitlabReposResult = await getGitLabReposFromConfig(config);
const gitlabRepos = gitlabReposResult.repos;
const warnings = gitlabReposResult.warnings;
const gitlabReposResult = await getGitLabReposFromConfig(config, orgId, db);
const gitlabRepos = gitlabReposResult.validRepos;
const notFound = gitlabReposResult.notFound;
const hostUrl = config.url ?? 'https://gitlab.com';
const repoNameRoot = new URL(hostUrl)
@ -124,6 +123,7 @@ export const compileGitlabConfig = async (
const projectUrl = `${hostUrl}/${project.path_with_namespace}`;
const cloneUrl = new URL(project.http_url_to_repo);
const isFork = project.forked_from_project !== undefined;
// @todo: we will need to double check whether 'internal' should also be considered public or not.
const isPublic = project.visibility === 'public';
const repoDisplayName = project.path_with_namespace;
const repoName = path.join(repoNameRoot, repoDisplayName);
@ -147,7 +147,7 @@ export const compileGitlabConfig = async (
isArchived: !!project.archived,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -177,17 +177,19 @@ export const compileGitlabConfig = async (
return {
repoData: repos,
warnings,
notFound,
};
}
export const compileGiteaConfig = async (
config: GiteaConnectionConfig,
connectionId: number): Promise<CompileResult> => {
connectionId: number,
orgId: number,
db: PrismaClient) => {
const giteaReposResult = await getGiteaReposFromConfig(config);
const giteaRepos = giteaReposResult.repos;
const warnings = giteaReposResult.warnings;
const giteaReposResult = await getGiteaReposFromConfig(config, orgId, db);
const giteaRepos = giteaReposResult.validRepos;
const notFound = giteaReposResult.notFound;
const hostUrl = config.url ?? 'https://gitea.com';
const repoNameRoot = new URL(hostUrl)
@ -218,7 +220,7 @@ export const compileGiteaConfig = async (
isArchived: !!repo.archived,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -246,13 +248,14 @@ export const compileGiteaConfig = async (
return {
repoData: repos,
warnings,
notFound,
};
}
export const compileGerritConfig = async (
config: GerritConnectionConfig,
connectionId: number): Promise<CompileResult> => {
connectionId: number,
orgId: number) => {
const gerritRepos = await getGerritReposFromConfig(config);
const hostUrl = config.url;
@ -298,7 +301,7 @@ export const compileGerritConfig = async (
isArchived: false,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -326,17 +329,23 @@ export const compileGerritConfig = async (
return {
repoData: repos,
warnings: [],
notFound: {
users: [],
orgs: [],
repos: [],
}
};
}
export const compileBitbucketConfig = async (
config: BitbucketConnectionConfig,
connectionId: number): Promise<CompileResult> => {
connectionId: number,
orgId: number,
db: PrismaClient) => {
const bitbucketReposResult = await getBitbucketReposFromConfig(config);
const bitbucketRepos = bitbucketReposResult.repos;
const warnings = bitbucketReposResult.warnings;
const bitbucketReposResult = await getBitbucketReposFromConfig(config, orgId, db);
const bitbucketRepos = bitbucketReposResult.validRepos;
const notFound = bitbucketReposResult.notFound;
const hostUrl = config.url ?? 'https://bitbucket.org';
const repoNameRoot = new URL(hostUrl)
@ -390,7 +399,7 @@ export const compileBitbucketConfig = async (
const repos = bitbucketRepos.map((repo) => {
const isServer = config.deploymentType === 'server';
const codeHostType: CodeHostType = isServer ? 'bitbucketServer' : 'bitbucketCloud';
const codeHostType = isServer ? 'bitbucket-server' : 'bitbucket-cloud'; // zoekt expects bitbucket-server
const displayName = isServer ? (repo as BitbucketServerRepository).name! : (repo as BitbucketCloudRepository).full_name!;
const externalId = isServer ? (repo as BitbucketServerRepository).id!.toString() : (repo as BitbucketCloudRepository).uuid!;
const isPublic = isServer ? (repo as BitbucketServerRepository).public : (repo as BitbucketCloudRepository).is_private === false;
@ -413,7 +422,7 @@ export const compileBitbucketConfig = async (
isArchived: isArchived,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -423,8 +432,7 @@ export const compileBitbucketConfig = async (
},
metadata: {
gitConfig: {
// zoekt expects bitbucket-server and bitbucket-cloud
'zoekt.web-url-type': codeHostType === 'bitbucketServer' ? 'bitbucket-server' : 'bitbucket-cloud',
'zoekt.web-url-type': codeHostType,
'zoekt.web-url': webUrl,
'zoekt.name': repoName,
'zoekt.archived': marshalBool(isArchived),
@ -442,20 +450,21 @@ export const compileBitbucketConfig = async (
return {
repoData: repos,
warnings,
notFound,
};
}
export const compileGenericGitHostConfig = async (
config: GenericGitHostConnectionConfig,
connectionId: number
): Promise<CompileResult> => {
connectionId: number,
orgId: number,
) => {
const configUrl = new URL(config.url);
if (configUrl.protocol === 'file:') {
return compileGenericGitHostConfig_file(config, connectionId);
return compileGenericGitHostConfig_file(config, orgId, connectionId);
}
else if (configUrl.protocol === 'http:' || configUrl.protocol === 'https:') {
return compileGenericGitHostConfig_url(config, connectionId);
return compileGenericGitHostConfig_url(config, orgId, connectionId);
}
else {
// Schema should prevent this, but throw an error just in case.
@ -465,8 +474,9 @@ export const compileGenericGitHostConfig = async (
export const compileGenericGitHostConfig_file = async (
config: GenericGitHostConnectionConfig,
orgId: number,
connectionId: number,
): Promise<CompileResult> => {
) => {
const configUrl = new URL(config.url);
assert(configUrl.protocol === 'file:', 'config.url must be a file:// URL');
@ -476,24 +486,28 @@ export const compileGenericGitHostConfig_file = async (
});
const repos: RepoData[] = [];
const warnings: string[] = [];
await Promise.all(repoPaths.map((repoPath) => gitOperationLimit(async () => {
const isGitRepo = await isPathAValidGitRepoRoot({
path: repoPath,
});
const notFound: {
users: string[],
orgs: string[],
repos: string[],
} = {
users: [],
orgs: [],
repos: [],
};
await Promise.all(repoPaths.map(async (repoPath) => {
const isGitRepo = await isPathAValidGitRepoRoot(repoPath);
if (!isGitRepo) {
const warning = `Skipping ${repoPath} - not a git repository.`;
logger.warn(warning);
warnings.push(warning);
logger.warn(`Skipping ${repoPath} - not a git repository.`);
notFound.repos.push(repoPath);
return;
}
const origin = await getOriginUrl(repoPath);
if (!origin) {
const warning = `Skipping ${repoPath} - remote.origin.url not found in git config.`;
logger.warn(warning);
warnings.push(warning);
logger.warn(`Skipping ${repoPath} - remote.origin.url not found in git config.`);
notFound.repos.push(repoPath);
return;
}
@ -504,7 +518,7 @@ export const compileGenericGitHostConfig_file = async (
const repoName = path.join(remoteUrl.host, remoteUrl.pathname.replace(/\.git$/, ''));
const repo: RepoData = {
external_codeHostType: 'genericGitHost',
external_codeHostType: 'generic-git-host',
external_codeHostUrl: remoteUrl.resource,
external_id: remoteUrl.toString(),
cloneUrl: `file://${repoPath}`,
@ -514,7 +528,7 @@ export const compileGenericGitHostConfig_file = async (
isArchived: false,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -532,33 +546,40 @@ export const compileGenericGitHostConfig_file = async (
}
repos.push(repo);
})));
}));
return {
repoData: repos,
warnings,
notFound,
}
}
export const compileGenericGitHostConfig_url = async (
config: GenericGitHostConnectionConfig,
orgId: number,
connectionId: number,
): Promise<CompileResult> => {
) => {
const remoteUrl = new URL(config.url);
assert(remoteUrl.protocol === 'http:' || remoteUrl.protocol === 'https:', 'config.url must be a http:// or https:// URL');
const warnings: string[] = [];
const notFound: {
users: string[],
orgs: string[],
repos: string[],
} = {
users: [],
orgs: [],
repos: [],
};
// Validate that we are dealing with a valid git repo.
const isGitRepo = await isUrlAValidGitRepo(remoteUrl.toString());
if (!isGitRepo) {
const warning = `Skipping ${remoteUrl.toString()} - not a git repository.`;
logger.warn(warning);
warnings.push(warning);
notFound.repos.push(remoteUrl.toString());
return {
repoData: [],
warnings,
notFound,
}
}
@ -567,7 +588,7 @@ export const compileGenericGitHostConfig_url = async (
const repoName = path.join(remoteUrl.host, remoteUrl.pathname.replace(/\.git$/, ''));
const repo: RepoData = {
external_codeHostType: 'genericGitHost',
external_codeHostType: 'generic-git-host',
external_codeHostUrl: remoteUrl.origin,
external_id: remoteUrl.toString(),
cloneUrl: remoteUrl.toString(),
@ -577,7 +598,7 @@ export const compileGenericGitHostConfig_url = async (
isArchived: false,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -593,17 +614,20 @@ export const compileGenericGitHostConfig_url = async (
return {
repoData: [repo],
warnings,
notFound,
}
}
export const compileAzureDevOpsConfig = async (
config: AzureDevOpsConnectionConfig,
connectionId: number): Promise<CompileResult> => {
connectionId: number,
orgId: number,
db: PrismaClient,
abortController: AbortController) => {
const azureDevOpsReposResult = await getAzureDevOpsReposFromConfig(config);
const azureDevOpsRepos = azureDevOpsReposResult.repos;
const warnings = azureDevOpsReposResult.warnings;
const azureDevOpsReposResult = await getAzureDevOpsReposFromConfig(config, orgId, db);
const azureDevOpsRepos = azureDevOpsReposResult.validRepos;
const notFound = azureDevOpsReposResult.notFound;
const hostUrl = config.url ?? 'https://dev.azure.com';
const repoNameRoot = new URL(hostUrl)
@ -614,18 +638,18 @@ export const compileAzureDevOpsConfig = async (
if (!repo.project) {
throw new Error(`No project found for repository ${repo.name}`);
}
const repoDisplayName = `${repo.project.name}/${repo.name}`;
const repoName = path.join(repoNameRoot, repoDisplayName);
const isPublic = repo.project.visibility === ProjectVisibility.Public;
if (!repo.remoteUrl) {
throw new Error(`No remoteUrl found for repository ${repoDisplayName}`);
}
if (!repo.id) {
throw new Error(`No id found for repository ${repoDisplayName}`);
}
// Construct web URL for the repository
const webUrl = repo.webUrl || `${hostUrl}/${repo.project.name}/_git/${repo.name}`;
@ -645,7 +669,7 @@ export const compileAzureDevOpsConfig = async (
isPublic: isPublic,
org: {
connect: {
id: SINGLE_TENANT_ORG_ID,
id: orgId,
},
},
connections: {
@ -673,6 +697,6 @@ export const compileAzureDevOpsConfig = async (
return {
repoData: repos,
warnings,
notFound,
};
}

View file

@ -1,612 +0,0 @@
import * as Sentry from '@sentry/node';
import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db";
import { createLogger, Logger } from "@sourcebot/shared";
import { env, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from '@sourcebot/shared';
import { existsSync } from 'fs';
import { readdir, rm } from 'fs/promises';
import { Job, Queue, ReservedJob, Worker } from "groupmq";
import { Redis } from 'ioredis';
import micromatch from 'micromatch';
import { GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS, INDEX_CACHE_DIR } from './constants.js';
import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getTags, isPathAValidGitRepoRoot, unsetGitConfig, upsertGitConfig } from './git.js';
import { captureEvent } from './posthog.js';
import { PromClient } from './promClient.js';
import { RepoWithConnections, Settings } from "./types.js";
import { getAuthCredentialsForRepo, getRepoPath, getShardPrefix, groupmqLifecycleExceptionWrapper, measure, setIntervalAsync } from './utils.js';
import { indexGitRepository } from './zoekt.js';
const LOG_TAG = 'repo-index-manager';
const logger = createLogger(LOG_TAG);
const createJobLogger = (jobId: string) => createLogger(`${LOG_TAG}:job:${jobId}`);
/**
 * Payload carried by every job placed on the repo index queue.
 */
type JobPayload = {
// Kind of work the job performs.
type: 'INDEX' | 'CLEANUP';
// Job identifier; `createJobs` sets it to the id of the created RepoIndexingJob row.
jobId: string;
// Id of the repo the job operates on.
repoId: number;
// Name of the repo; used in log messages and as a metric label.
repoName: string;
};
/**
* Manages the lifecycle of repository data on disk, including git working copies
* and search index shards. Handles both indexing operations (cloning/fetching repos
* and building search indexes) and cleanup operations (removing orphaned repos and
* their associated data).
*
* Uses a job queue system to process indexing and cleanup tasks asynchronously,
* with configurable concurrency limits and retry logic. Automatically schedules
* re-indexing of repos based on configured intervals and manages garbage collection
* of repos that are no longer connected to any source.
*/
export class RepoIndexManager {
private interval?: NodeJS.Timeout;
private queue: Queue<JobPayload>;
private worker: Worker<JobPayload>;
/**
 * Creates the job queue and its worker, and subscribes to the worker's
 * lifecycle events. The worker does not start processing until
 * `startScheduler` is called.
 */
constructor(
    private db: PrismaClient,
    private settings: Settings,
    private redis: Redis,
    private promClient: PromClient,
) {
    const groupmqLoggingEnabled = env.DEBUG_ENABLE_GROUPMQ_LOGGING === 'true';

    this.queue = new Queue<JobPayload>({
        redis,
        namespace: 'repo-index-queue',
        jobTimeoutMs: this.settings.repoIndexTimeoutMs,
        maxAttempts: 3,
        logger: groupmqLoggingEnabled,
    });

    // Only pass a `logger` option to the worker when debug logging is enabled;
    // otherwise the key is left out entirely.
    const workerLoggingOptions = groupmqLoggingEnabled ? { logger: true } : {};

    this.worker = new Worker<JobPayload>({
        queue: this.queue,
        maxStalledCount: 1,
        handler: this.runJob.bind(this),
        concurrency: this.settings.maxRepoIndexingJobConcurrency,
        ...workerLoggingOptions,
    });

    const worker = this.worker;
    worker.on('completed', this.onJobCompleted.bind(this));
    worker.on('failed', this.onJobFailed.bind(this));
    worker.on('stalled', this.onJobStalled.bind(this));
    worker.on('error', this.onWorkerError.bind(this));
    // graceful-timeout fires when a job is still being processed after
    // worker.close() was called and the timeout period has elapsed. When that
    // happens the job is failed without a retry.
    worker.on('graceful-timeout', this.onJobGracefulTimeout.bind(this));
}
/**
 * Starts the periodic scheduling loop and the queue worker.
 *
 * Each tick schedules index jobs first and cleanup jobs second; the tick
 * period is `settings.reindexRepoPollingIntervalMs`.
 */
public startScheduler() {
    logger.debug('Starting scheduler');

    const runSchedulingPass = async () => {
        await this.scheduleIndexJobs();
        await this.scheduleCleanupJobs();
    };

    this.interval = setIntervalAsync(runSchedulingPass, this.settings.reindexRepoPollingIntervalMs);
    this.worker.run();
}
/**
 * Finds repos that are due for (re)indexing and creates INDEX jobs for them.
 *
 * A repo is due when it has never been indexed or was last indexed before
 * the re-index cutoff, and it has no INDEX job that is either still active
 * (and recent enough not to be considered stuck) or recently failed.
 */
private async scheduleIndexJobs() {
    // Repos indexed before this instant are due for a re-index.
    const reindexCutoff = new Date(Date.now() - this.settings.reindexIntervalMs);
    // Active jobs created before this instant are older than the index timeout.
    const activeJobCutoff = new Date(Date.now() - this.settings.repoIndexTimeoutMs);

    const dueRepos = await this.db.repo.findMany({
        where: {
            AND: [
                {
                    OR: [
                        { indexedAt: null },
                        { indexedAt: { lt: reindexCutoff } },
                    ]
                },
                {
                    NOT: {
                        jobs: {
                            some: {
                                AND: [
                                    {
                                        type: RepoIndexingJobType.INDEX,
                                    },
                                    {
                                        OR: [
                                            // Skip repos that have a pending / in-progress job created after the
                                            // timeout cutoff. Older active jobs are ignored so that a job stuck in
                                            // an active state does not block scheduling forever.
                                            {
                                                AND: [
                                                    {
                                                        status: {
                                                            in: [
                                                                RepoIndexingJobStatus.PENDING,
                                                                RepoIndexingJobStatus.IN_PROGRESS,
                                                            ]
                                                        },
                                                    },
                                                    {
                                                        createdAt: {
                                                            gt: activeJobCutoff,
                                                        }
                                                    }
                                                ]
                                            },
                                            // Skip repos whose job failed after the re-index cutoff.
                                            {
                                                AND: [
                                                    { status: RepoIndexingJobStatus.FAILED },
                                                    { completedAt: { gt: reindexCutoff } },
                                                ]
                                            }
                                        ]
                                    }
                                ]
                            }
                        }
                    }
                }
            ],
        },
    });

    if (dueRepos.length === 0) {
        return;
    }

    await this.createJobs(dueRepos, RepoIndexingJobType.INDEX);
}
/**
 * Finds repos that are no longer attached to any connection and creates
 * CLEANUP jobs for them.
 *
 * A repo qualifies when it has no connections, was never indexed or was
 * last indexed before the garbage collection grace cutoff, and has no
 * pending / in-progress CLEANUP job created after the timeout cutoff.
 */
private async scheduleCleanupJobs() {
    // Repos indexed before this instant are past the garbage collection grace period.
    const gcCutoff = new Date(Date.now() - this.settings.repoGarbageCollectionGracePeriodMs);
    // Active jobs created before this instant are older than the index timeout.
    const activeJobCutoff = new Date(Date.now() - this.settings.repoIndexTimeoutMs);

    const orphanedRepos = await this.db.repo.findMany({
        where: {
            connections: {
                none: {}
            },
            OR: [
                { indexedAt: null },
                { indexedAt: { lt: gcCutoff } },
            ],
            NOT: {
                jobs: {
                    some: {
                        AND: [
                            {
                                type: RepoIndexingJobType.CLEANUP,
                            },
                            {
                                status: {
                                    in: [
                                        RepoIndexingJobStatus.PENDING,
                                        RepoIndexingJobStatus.IN_PROGRESS,
                                    ]
                                },
                            },
                            {
                                createdAt: {
                                    gt: activeJobCutoff,
                                }
                            }
                        ]
                    }
                }
            }
        }
    });

    if (orphanedRepos.length === 0) {
        return;
    }

    await this.createJobs(orphanedRepos, RepoIndexingJobType.CLEANUP);
}
/**
 * Creates one job row of the given type per repo and enqueues each of them.
 *
 * @param repos repos to create jobs for.
 * @param type kind of job to create.
 * @returns the ids of the created jobs.
 */
public async createJobs(repos: Repo[], type: RepoIndexingJobType) {
    // @note: this is deliberately not wrapped in a transaction. Doing so could
    // let a job be picked up and run before the transaction that created its
    // row has been committed.
    const createdJobs = await this.db.repoIndexingJob.createManyAndReturn({
        data: repos.map(({ id }) => ({
            type,
            repoId: id,
        })),
        include: {
            repo: true,
        }
    });

    // The label only depends on `type`, so compute it once for all jobs.
    const metricTypeLabel = getJobTypePrometheusLabel(type);

    for (const { id: jobId, repoId, repo } of createdJobs) {
        await this.queue.add({
            // Jobs for the same repo share a group.
            groupId: `repo:${repoId}`,
            data: {
                jobId,
                type,
                repoName: repo.name,
                repoId: repo.id,
            },
            jobId,
        });

        this.promClient.pendingRepoIndexJobs.inc({ repo: repo.name, type: metricTypeLabel });
    }

    return createdJobs.map(({ id }) => id);
}
/**
 * Worker handler: executes a single INDEX or CLEANUP job.
 *
 * Marks the job row IN_PROGRESS, moves the job from the pending to the active
 * metric, then runs the job. SIGTERM / SIGINT abort the in-flight operation
 * through an AbortController while the job runs.
 *
 * @throws when the job row is neither PENDING nor IN_PROGRESS, or when the
 *         underlying index / cleanup operation throws.
 */
private async runJob(job: ReservedJob<JobPayload>) {
    const { jobId, repoName } = job.data;
    const jobLogger = createJobLogger(jobId);
    jobLogger.info(`Running ${job.data.type} job ${jobId} for repo ${repoName} (id: ${job.data.repoId}) (attempt ${job.attempts + 1} / ${job.maxAttempts})`);

    const { status } = await this.db.repoIndexingJob.findUniqueOrThrow({
        where: { id: jobId },
        select: { status: true },
    });

    // Fail safe: PENDING means this is the first run and IN_PROGRESS means a retry.
    // Any other status indicates an invalid state, so the job is rejected.
    const isRunnable =
        status === RepoIndexingJobStatus.PENDING ||
        status === RepoIndexingJobStatus.IN_PROGRESS;
    if (!isRunnable) {
        throw new Error(`Job ${jobId} is not in a valid state. Expected: ${RepoIndexingJobStatus.PENDING} or ${RepoIndexingJobStatus.IN_PROGRESS}. Actual: ${status}. Skipping.`);
    }

    const { repo, type: jobType } = await this.db.repoIndexingJob.update({
        where: { id: jobId },
        data: {
            status: RepoIndexingJobStatus.IN_PROGRESS,
        },
        select: {
            type: true,
            repo: {
                include: {
                    connections: {
                        include: {
                            connection: true,
                        }
                    }
                }
            }
        }
    });

    const metricTypeLabel = getJobTypePrometheusLabel(jobType);
    this.promClient.pendingRepoIndexJobs.dec({ repo: repoName, type: metricTypeLabel });
    this.promClient.activeRepoIndexJobs.inc({ repo: repoName, type: metricTypeLabel });

    const abortController = new AbortController();
    const onShutdownSignal = () => {
        jobLogger.info(`Received shutdown signal, aborting...`);
        // Aborting cancels every operation that was handed the signal.
        abortController.abort();
    };

    process.on('SIGTERM', onShutdownSignal);
    process.on('SIGINT', onShutdownSignal);

    try {
        if (jobType === RepoIndexingJobType.INDEX) {
            const indexedRevisions = await this.indexRepository(repo, jobLogger, abortController.signal);

            // Record which revisions were indexed on the job row.
            await this.db.repoIndexingJob.update({
                where: { id: jobId },
                data: {
                    metadata: {
                        indexedRevisions,
                    } satisfies RepoIndexingJobMetadata,
                },
            });
        } else if (jobType === RepoIndexingJobType.CLEANUP) {
            await this.cleanupRepository(repo, jobLogger);
        }
    } finally {
        // Always detach the handlers so they do not pile up across jobs.
        process.off('SIGTERM', onShutdownSignal);
        process.off('SIGINT', onShutdownSignal);
    }
}
/**
 * Brings the on-disk copy of `repo` up to date and builds its search index.
 *
 * Steps, in order:
 *  1. For writable repos whose directory exists but is not a valid git
 *     repository root, delete the directory so a fresh clone happens.
 *  2. Fetch (directory exists) or clone (directory missing) the repo. Read-only
 *     repos are neither fetched nor cloned.
 *  3. Upsert the git config stored in the repo metadata (writable repos only).
 *  4. Resolve the revisions to index: HEAD plus the branches / tags matching the
 *     globs in the repo metadata, truncated to the indexer's limit.
 *  5. Run the indexer.
 *
 * @param repo repo (with its connections) to index.
 * @param logger job-scoped logger.
 * @param signal abort signal forwarded to the git and indexing operations.
 * @returns the revisions that were handed to the indexer.
 */
private async indexRepository(repo: RepoWithConnections, logger: Logger, signal: AbortSignal) {
    // zoekt has a limit of 64 branches/tags to index.
    const MAX_INDEXED_REVISIONS = 64;

    const { path: repoPath, isReadOnly } = getRepoPath(repo);

    const metadata = repoMetadataSchema.parse(repo.metadata);

    const credentials = await getAuthCredentialsForRepo(repo);
    const cloneUrlMaybeWithToken = credentials?.cloneUrlWithToken ?? repo.cloneUrl;
    const authHeader = credentials?.authHeader ?? undefined;

    // If the repo path exists but it is not a valid git repository root, this indicates
    // that the repository is in a bad state. To fix, we remove the directory and perform
    // a fresh clone. The validity check runs once, honours the abort signal, and is
    // skipped for read-only repos since their directory is never deleted.
    if (existsSync(repoPath) && !isReadOnly) {
        const isValidGitRepo = await isPathAValidGitRepoRoot({
            path: repoPath,
            signal,
        });

        if (!isValidGitRepo) {
            logger.warn(`${repoPath} is not a valid git repository root. Deleting directory and performing fresh clone.`);
            await rm(repoPath, { recursive: true, force: true });
        }
    }

    if (existsSync(repoPath) && !isReadOnly) {
        // @NOTE: in #483, we changed the cloning method s.t., we _no longer_
        // write the clone URL (which could contain a auth token) to the
        // `remote.origin.url` entry. For the upgrade scenario, we want
        // to unset this key since it is no longer needed, hence this line.
        // This will no-op if the key is already unset.
        // @see: https://github.com/sourcebot-dev/sourcebot/pull/483
        await unsetGitConfig({
            path: repoPath,
            keys: ["remote.origin.url"],
            signal,
        });

        logger.info(`Fetching ${repo.name} (id: ${repo.id})...`);
        const { durationMs } = await measure(() => fetchRepository({
            cloneUrl: cloneUrlMaybeWithToken,
            authHeader,
            path: repoPath,
            onProgress: ({ method, stage, progress }) => {
                logger.debug(`git.${method} ${stage} stage ${progress}% complete for ${repo.name} (id: ${repo.id})`)
            },
            signal,
        }));
        const fetchDuration_s = durationMs / 1000;

        process.stdout.write('\n');
        logger.info(`Fetched ${repo.name} (id: ${repo.id}) in ${fetchDuration_s}s`);
    } else if (!isReadOnly) {
        logger.info(`Cloning ${repo.name} (id: ${repo.id})...`);

        const { durationMs } = await measure(() => cloneRepository({
            cloneUrl: cloneUrlMaybeWithToken,
            authHeader,
            path: repoPath,
            onProgress: ({ method, stage, progress }) => {
                logger.debug(`git.${method} ${stage} stage ${progress}% complete for ${repo.name} (id: ${repo.id})`)
            },
            signal
        }));
        const cloneDuration_s = durationMs / 1000;

        process.stdout.write('\n');
        logger.info(`Cloned ${repo.name} (id: ${repo.id}) in ${cloneDuration_s}s`);
    }

    // Regardless of clone or fetch, always upsert the git config for the repo.
    // This ensures that the git config is always up to date for whatever we
    // have in the DB.
    if (metadata.gitConfig && !isReadOnly) {
        await upsertGitConfig({
            path: repoPath,
            gitConfig: metadata.gitConfig,
            signal,
        });
    }

    // HEAD is always indexed; matching branches and tags are appended after it.
    let revisions = [
        'HEAD'
    ];

    if (metadata.branches) {
        const branchGlobs = metadata.branches
        const allBranches = await getBranches(repoPath);
        const matchingBranches =
            allBranches
                .filter((branch) => micromatch.isMatch(branch, branchGlobs))
                .map((branch) => `refs/heads/${branch}`);

        revisions = [
            ...revisions,
            ...matchingBranches
        ];
    }

    if (metadata.tags) {
        const tagGlobs = metadata.tags;
        const allTags = await getTags(repoPath);
        const matchingTags =
            allTags
                .filter((tag) => micromatch.isMatch(tag, tagGlobs))
                .map((tag) => `refs/tags/${tag}`);

        revisions = [
            ...revisions,
            ...matchingTags
        ];
    }

    if (revisions.length > MAX_INDEXED_REVISIONS) {
        logger.warn(`Too many revisions (${revisions.length}) for repo ${repo.id}, truncating to ${MAX_INDEXED_REVISIONS}`);
        captureEvent('backend_revisions_truncated', {
            repoId: repo.id,
            revisionCount: revisions.length,
        });
        revisions = revisions.slice(0, MAX_INDEXED_REVISIONS);
    }

    logger.info(`Indexing ${repo.name} (id: ${repo.id})...`);
    const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal));
    const indexDuration_s = durationMs / 1000;
    logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);

    return revisions;
}
/**
 * Removes a repo's on-disk data: its directory (unless the repo path is
 * read-only) and every file in the index cache directory whose name starts
 * with the repo's shard prefix.
 *
 * @param repo repo whose data should be removed.
 * @param logger job-scoped logger.
 */
private async cleanupRepository(repo: Repo, logger: Logger) {
    const { path: repoPath, isReadOnly } = getRepoPath(repo);

    const hasDeletableDirectory = existsSync(repoPath) && !isReadOnly;
    if (hasDeletableDirectory) {
        logger.info(`Deleting repo directory ${repoPath}`);
        await rm(repoPath, { recursive: true, force: true });
    }

    const shardPrefix = getShardPrefix(repo.orgId, repo.id);
    const cacheEntries = await readdir(INDEX_CACHE_DIR);
    const shardFiles = cacheEntries.filter((entry) => entry.startsWith(shardPrefix));

    // Shards are removed one at a time, in directory listing order.
    for (const shardFile of shardFiles) {
        const shardPath = `${INDEX_CACHE_DIR}/${shardFile}`;
        logger.info(`Deleting shard file ${shardPath}`);
        await rm(shardPath, { force: true });
    }
}
/**
 * 'completed' event handler. Marks the job row COMPLETED and then finalizes
 * the repo: an INDEX job stamps the repo with the index time, HEAD commit
 * hash and indexed revisions; a CLEANUP job deletes the repo row.
 */
private onJobCompleted = async (job: Job<JobPayload>) =>
    groupmqLifecycleExceptionWrapper('onJobCompleted', logger, async () => {
        const { jobId, repoName } = job.data;
        const jobLogger = createJobLogger(jobId);

        const completedJob = await this.db.repoIndexingJob.update({
            where: { id: jobId },
            data: {
                status: RepoIndexingJobStatus.COMPLETED,
                completedAt: new Date(),
            },
            include: {
                repo: true,
            }
        });

        const metricTypeLabel = getJobTypePrometheusLabel(completedJob.type);

        if (completedJob.type === RepoIndexingJobType.INDEX) {
            const { path: repoPath } = getRepoPath(completedJob.repo);
            const headCommitHash = await getCommitHashForRefName({
                path: repoPath,
                refName: 'HEAD',
            });

            // The revisions were stored on the job row by `runJob`.
            const { indexedRevisions } = repoIndexingJobMetadataSchema.parse(completedJob.metadata);

            const updatedRepo = await this.db.repo.update({
                where: { id: completedJob.repoId },
                data: {
                    indexedAt: new Date(),
                    indexedCommitHash: headCommitHash,
                    metadata: {
                        ...(completedJob.repo.metadata as RepoMetadata),
                        indexedRevisions,
                    } satisfies RepoMetadata,
                }
            });

            jobLogger.info(`Completed index job ${jobId} for repo ${updatedRepo.name} (id: ${updatedRepo.id})`);
        }
        else if (completedJob.type === RepoIndexingJobType.CLEANUP) {
            const deletedRepo = await this.db.repo.delete({
                where: { id: completedJob.repoId },
            });

            jobLogger.info(`Completed cleanup job ${jobId} for repo ${deletedRepo.name} (id: ${deletedRepo.id})`);
        }

        // Metrics for a successful job.
        this.promClient.activeRepoIndexJobs.dec({ repo: repoName, type: metricTypeLabel });
        this.promClient.repoIndexJobSuccessTotal.inc({ repo: repoName, type: metricTypeLabel });
    });
/**
 * 'failed' event handler. When the failed attempt was the last allowed one
 * the job row is marked FAILED and failure metrics are recorded; otherwise
 * only the re-attempt metric is bumped and a warning is logged.
 */
private onJobFailed = async (job: Job<JobPayload>) =>
    groupmqLifecycleExceptionWrapper('onJobFailed', logger, async () => {
        const { jobId, repoName } = job.data;
        const jobLogger = createJobLogger(jobId);

        const attempt = job.attemptsMade + 1;
        const isFinalAttempt = attempt >= job.opts.attempts;

        const metricTypeLabel = getJobTypePrometheusLabel(job.data.type);

        if (!isFinalAttempt) {
            const repo = await this.db.repo.findUniqueOrThrow({
                where: { id: job.data.repoId },
            });

            this.promClient.repoIndexJobReattemptsTotal.inc({ repo: repoName, type: metricTypeLabel });

            jobLogger.warn(`Failed job ${jobId} for repo ${repo.name} (id: ${repo.id}). Attempt ${attempt} / ${job.opts.attempts}. Retrying.`);
            return;
        }

        const { repo } = await this.db.repoIndexingJob.update({
            where: { id: jobId },
            data: {
                status: RepoIndexingJobStatus.FAILED,
                completedAt: new Date(),
                errorMessage: job.failedReason,
            },
            select: { repo: true }
        });

        this.promClient.activeRepoIndexJobs.dec({ repo: repoName, type: metricTypeLabel });
        this.promClient.repoIndexJobFailTotal.inc({ repo: repoName, type: metricTypeLabel });

        jobLogger.error(`Failed job ${jobId} for repo ${repo.name} (id: ${repo.id}). Attempt ${attempt} / ${job.opts.attempts}. Failing job.`);
    });
/**
 * 'stalled' event handler. Marks the job row FAILED with the message
 * 'Job stalled' and records failure metrics for the job's repo.
 */
private onJobStalled = async (jobId: string) =>
    groupmqLifecycleExceptionWrapper('onJobStalled', logger, async () => {
        const jobLogger = createJobLogger(jobId);

        const stalledJob = await this.db.repoIndexingJob.update({
            where: { id: jobId },
            data: {
                status: RepoIndexingJobStatus.FAILED,
                completedAt: new Date(),
                errorMessage: 'Job stalled',
            },
            select: { repo: true, type: true }
        });

        // Only the job id is available here, so the metric labels come from the DB row.
        const stalledRepo = stalledJob.repo;
        const metricTypeLabel = getJobTypePrometheusLabel(stalledJob.type);
        this.promClient.activeRepoIndexJobs.dec({ repo: stalledRepo.name, type: metricTypeLabel });
        this.promClient.repoIndexJobFailTotal.inc({ repo: stalledRepo.name, type: metricTypeLabel });

        jobLogger.error(`Job ${jobId} stalled for repo ${stalledRepo.name} (id: ${stalledRepo.id})`);
    });
/**
 * 'graceful-timeout' event handler. Marks the job row FAILED with the
 * message 'Job timed out' and records failure metrics.
 */
private onJobGracefulTimeout = async (job: Job<JobPayload>) =>
    groupmqLifecycleExceptionWrapper('onJobGracefulTimeout', logger, async () => {
        const { jobId, repoName } = job.data;
        const jobLogger = createJobLogger(jobId);
        const metricTypeLabel = getJobTypePrometheusLabel(job.data.type);

        const timedOutJob = await this.db.repoIndexingJob.update({
            where: { id: jobId },
            data: {
                status: RepoIndexingJobStatus.FAILED,
                completedAt: new Date(),
                errorMessage: 'Job timed out',
            },
            select: { repo: true }
        });
        const timedOutRepo = timedOutJob.repo;

        this.promClient.activeRepoIndexJobs.dec({ repo: repoName, type: metricTypeLabel });
        this.promClient.repoIndexJobFailTotal.inc({ repo: repoName, type: metricTypeLabel });

        jobLogger.error(`Job ${jobId} timed out for repo ${timedOutRepo.name} (id: ${timedOutRepo.id}). Failing job.`);
    });
/**
 * 'error' event handler for the worker: reports the error to Sentry and logs it.
 */
private async onWorkerError(error: Error) {
    const workerError = error;
    Sentry.captureException(workerError);
    logger.error(`Index syncer worker error.`, workerError);
}
/**
 * Stops the scheduler, closes the worker (waiting up to the graceful stop
 * timeout), and removes the group lock keys of jobs that were in progress
 * when shutdown began.
 */
public async dispose() {
    if (this.interval) {
        clearInterval(this.interval);
    }

    // Snapshot the in-progress jobs before closing the worker.
    const jobsInFlight = this.worker.getCurrentJobs();
    await this.worker.close(GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS);

    // Manually release the group locks of in-progress jobs to prevent deadlocks.
    // @see: https://github.com/Openpanel-dev/groupmq/issues/8
    for (const { job: inFlightJob } of jobsInFlight) {
        const groupLockKey = `groupmq:repo-index-queue:lock:${inFlightJob.groupId}`;
        logger.debug(`Releasing group lock ${groupLockKey} for in progress job ${inFlightJob.id}`);
        await this.redis.del(groupLockKey);
    }

    // @note: As of groupmq v1.0.0, queue.close() just closes the underlying
    // redis connection. Since the same redis client is shared with other
    // components, skip this step here; the client is closed directly in index.ts.
    // await this.queue.close();
}
}
/**
 * Maps a job type to the label value used on Prometheus metrics:
 * 'index' for INDEX jobs and 'cleanup' for every other type.
 */
const getJobTypePrometheusLabel = (type: RepoIndexingJobType) => {
    if (type === RepoIndexingJobType.INDEX) {
        return 'index';
    }
    return 'cleanup';
};

View file

@ -0,0 +1,566 @@
import * as Sentry from "@sentry/node";
import { PrismaClient, Repo, RepoIndexingStatus, StripeSubscriptionStatus } from "@sourcebot/db";
import { createLogger } from "@sourcebot/logger";
import { Job, Queue, Worker } from 'bullmq';
import { existsSync, promises, readdirSync } from 'fs';
import { Redis } from 'ioredis';
import { env } from './env.js';
import { cloneRepository, fetchRepository, unsetGitConfig, upsertGitConfig } from "./git.js";
import { PromClient } from './promClient.js';
import { AppContext, RepoWithConnections, Settings, repoMetadataSchema } from "./types.js";
import { getAuthCredentialsForRepo, getRepoPath, getShardPrefix, measure } from "./utils.js";
import { indexGitRepository } from "./zoekt.js";
const REPO_INDEXING_QUEUE = 'repoIndexingQueue';
const REPO_GC_QUEUE = 'repoGarbageCollectionQueue';
type RepoIndexingPayload = {
repo: RepoWithConnections,
}
type RepoGarbageCollectionPayload = {
repo: Repo,
}
const logger = createLogger('repo-manager');
export class RepoManager {
private indexWorker: Worker;
private indexQueue: Queue<RepoIndexingPayload>;
private gcWorker: Worker;
private gcQueue: Queue<RepoGarbageCollectionPayload>;
private interval?: NodeJS.Timeout;
/**
 * Sets up two queue/worker pairs on the given redis connection, one for
 * repo indexing and one for repo garbage collection, and subscribes to
 * each worker's 'completed' and 'failed' events.
 */
constructor(
    private db: PrismaClient,
    private settings: Settings,
    redis: Redis,
    private promClient: PromClient,
    private ctx: AppContext,
) {
    const {
        maxRepoIndexingJobConcurrency,
        maxRepoGarbageCollectionJobConcurrency,
    } = this.settings;

    // Repo indexing
    this.indexQueue = new Queue<RepoIndexingPayload>(REPO_INDEXING_QUEUE, { connection: redis });
    this.indexWorker = new Worker(
        REPO_INDEXING_QUEUE,
        this.runIndexJob.bind(this),
        {
            connection: redis,
            concurrency: maxRepoIndexingJobConcurrency,
        },
    );
    this.indexWorker.on('completed', this.onIndexJobCompleted.bind(this));
    this.indexWorker.on('failed', this.onIndexJobFailed.bind(this));

    // Garbage collection
    this.gcQueue = new Queue<RepoGarbageCollectionPayload>(REPO_GC_QUEUE, { connection: redis });
    this.gcWorker = new Worker(
        REPO_GC_QUEUE,
        this.runGarbageCollectionJob.bind(this),
        {
            connection: redis,
            concurrency: maxRepoGarbageCollectionJobConcurrency,
        },
    );
    this.gcWorker.on('completed', this.onGarbageCollectionJobCompleted.bind(this));
    this.gcWorker.on('failed', this.onGarbageCollectionJobFailed.bind(this));
}
/**
 * Starts the periodic scheduling loop.
 *
 * Every `settings.reindexRepoPollingIntervalMs` a tick schedules repo
 * indexing, then repo garbage collection, then repo timeouts. Two safeguards
 * apply to each tick:
 *  - a tick is skipped while the previous one is still running, so slow
 *    passes do not overlap and schedule the same repos twice;
 *  - errors thrown by a tick are reported and logged instead of surfacing
 *    as an unhandled promise rejection from the timer callback.
 */
public startScheduler() {
    logger.debug('Starting scheduler');

    let isTickInProgress = false;

    this.interval = setInterval(async () => {
        if (isTickInProgress) {
            logger.debug('Previous scheduler tick is still running, skipping this tick');
            return;
        }

        isTickInProgress = true;
        try {
            await this.fetchAndScheduleRepoIndexing();
            await this.fetchAndScheduleRepoGarbageCollection();
            await this.fetchAndScheduleRepoTimeouts();
        } catch (error) {
            // setInterval ignores the promise returned by the callback, so a
            // rejection must be handled here.
            Sentry.captureException(error);
            logger.error(`Scheduler tick failed: ${error}`);
        } finally {
            isTickInProgress = false;
        }
    }, this.settings.reindexRepoPollingIntervalMs);
}
///////////////////////////
// Repo indexing
///////////////////////////
/**
 * Marks the given repos as IN_INDEX_QUEUE and enqueues one index job per
 * repo, grouped by org. Runs inside a DB transaction; any error is logged
 * and not rethrown.
 *
 * @param repos repos (with connections) to enqueue for indexing.
 */
private async scheduleRepoIndexingBulk(repos: RepoWithConnections[]) {
    await this.db.$transaction(async (tx) => {
        await tx.repo.updateMany({
            where: { id: { in: repos.map(repo => repo.id) } },
            data: { repoIndexingStatus: RepoIndexingStatus.IN_INDEX_QUEUE }
        });

        // Bucket the repos by the org that owns them.
        const reposByOrg: Record<number, RepoWithConnections[]> = {};
        for (const repo of repos) {
            (reposByOrg[repo.orgId] ??= []).push(repo);
        }

        for (const [orgId, orgRepos] of Object.entries(reposByOrg)) {
            // The priority value grows with the number of repos in the batch
            // (more repos = lower priority). This helps prevent large orgs
            // from overwhelming the indexQueue.
            const priority = Math.min(Math.ceil(orgRepos.length / 10), 2097152);

            const bulkJobs = orgRepos.map(repo => ({
                name: 'repoIndexJob',
                data: { repo },
                opts: {
                    priority: priority,
                    removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
                    removeOnFail: env.REDIS_REMOVE_ON_FAIL,
                },
            }));
            await this.indexQueue.addBulk(bulkJobs);

            // One pending-job increment per enqueued repo.
            for (const repo of orgRepos) {
                this.promClient.pendingRepoIndexingJobs.inc({ repo: repo.id.toString() });
            }

            logger.info(`Added ${orgRepos.length} jobs to indexQueue for org ${orgId} with priority ${priority}`);
        }
    }).catch((err: unknown) => {
        logger.error(`Failed to add jobs to indexQueue for repos ${repos.map(repo => repo.id).join(', ')}: ${err}`);
    });
}
/**
 * Loads every repo that needs indexing and hands them to
 * `scheduleRepoIndexingBulk`.
 *
 * A repo needs indexing when its status is NEW, or when its status is
 * INDEXED and it has no index date or was indexed before the re-index cutoff.
 */
private async fetchAndScheduleRepoIndexing() {
    // Repos indexed before this instant are due for a re-index.
    const reindexCutoff = new Date(Date.now() - this.settings.reindexIntervalMs);

    const reposNeedingIndex = await this.db.repo.findMany({
        where: {
            OR: [
                // "NEW" is a bit of a misnomer: it means the repo must be indexed
                // right away. Usually that is because the repo was just created, but
                // it can also mean a retry was requested after a failed index. Either
                // way the indexedAt timestamp must not block scheduling here.
                {
                    repoIndexingStatus: RepoIndexingStatus.NEW,
                },
                // Already-indexed repos are only re-indexed once the re-index interval
                // has elapsed (or if the date isn't set for some reason).
                {
                    AND: [
                        { repoIndexingStatus: RepoIndexingStatus.INDEXED },
                        {
                            OR: [
                                { indexedAt: null },
                                { indexedAt: { lt: reindexCutoff } },
                            ]
                        }
                    ]
                }
            ]
        },
        include: {
            connections: {
                include: {
                    connection: true
                }
            }
        }
    });

    if (reposNeedingIndex.length === 0) {
        return;
    }

    await this.scheduleRepoIndexingBulk(reposNeedingIndex);
}
/**
 * Fetches or clones the repo on disk, upserts its git config, and runs the
 * indexer. Read-only repos are never deleted, fetched, cloned or reconfigured.
 *
 * @param repo repo (with connections) to sync.
 * @param repoAlreadyInIndexingState whether the repo was already in the
 *        INDEXING state when the job started; if so an existing writable
 *        directory is deleted first so a fresh clone is made.
 */
private async syncGitRepository(repo: RepoWithConnections, repoAlreadyInIndexingState: boolean) {
    const { path: repoPath, isReadOnly } = getRepoPath(repo, this.ctx);

    const metadata = repoMetadataSchema.parse(repo.metadata);

    // A repo that was already in the indexing state was most likely being
    // processed by a job that got killed and picked up again. Remove any
    // existing directory so the sync starts from a fresh clone.
    const shouldResetDirectory = repoAlreadyInIndexingState && existsSync(repoPath) && !isReadOnly;
    if (shouldResetDirectory) {
        logger.info(`Deleting repo directory ${repoPath} during sync because it was already in the indexing state`);
        await promises.rm(repoPath, { recursive: true, force: true });
    }

    const credentials = await getAuthCredentialsForRepo(repo, this.db);
    const cloneUrlMaybeWithToken = credentials?.cloneUrlWithToken ?? repo.cloneUrl;
    const authHeader = credentials?.authHeader ?? undefined;

    const directoryExists = existsSync(repoPath);
    if (directoryExists && !isReadOnly) {
        // @NOTE: in #483, the cloning method was changed so that the clone URL
        // (which could contain an auth token) is _no longer_ written to the
        // `remote.origin.url` entry. For the upgrade scenario this key is
        // unset since it is no longer needed. This is a no-op if the key is
        // already unset.
        // @see: https://github.com/sourcebot-dev/sourcebot/pull/483
        await unsetGitConfig(repoPath, ["remote.origin.url"]);

        logger.info(`Fetching ${repo.displayName}...`);
        const fetchTiming = await measure(() => fetchRepository({
            cloneUrl: cloneUrlMaybeWithToken,
            authHeader,
            path: repoPath,
            onProgress: ({ method, stage, progress }) => {
                logger.debug(`git.${method} ${stage} stage ${progress}% complete for ${repo.displayName}`)
            }
        }));
        const fetchDuration_s = fetchTiming.durationMs / 1000;

        process.stdout.write('\n');
        logger.info(`Fetched ${repo.displayName} in ${fetchDuration_s}s`);
    } else if (!isReadOnly) {
        logger.info(`Cloning ${repo.displayName}...`);

        const cloneTiming = await measure(() => cloneRepository({
            cloneUrl: cloneUrlMaybeWithToken,
            authHeader,
            path: repoPath,
            onProgress: ({ method, stage, progress }) => {
                logger.debug(`git.${method} ${stage} stage ${progress}% complete for ${repo.displayName}`)
            }
        }));
        const cloneDuration_s = cloneTiming.durationMs / 1000;

        process.stdout.write('\n');
        logger.info(`Cloned ${repo.displayName} in ${cloneDuration_s}s`);
    }

    // Whether the repo was cloned or fetched, always upsert its git config so
    // that it stays in sync with what is stored in the DB.
    if (metadata.gitConfig && !isReadOnly) {
        await upsertGitConfig(repoPath, metadata.gitConfig);
    }

    logger.info(`Indexing ${repo.displayName}...`);
    const indexTiming = await measure(() => indexGitRepository(repo, this.settings, this.ctx));
    const indexDuration_s = indexTiming.durationMs / 1000;
    logger.info(`Indexed ${repo.displayName} in ${indexDuration_s}s`);
}
/**
 * Worker entry point for a repo indexing job.
 *
 * Marks the repo as INDEXING, adjusts the active/pending job gauges, and then
 * syncs the repository. A failed sync is retried (three attempts in total)
 * with an exponentially growing sleep between attempts.
 *
 * @param job - Queue job whose payload carries the repo to index.
 * @throws If the repo no longer exists in the DB, or the last sync attempt fails.
 */
private async runIndexJob(job: Job<RepoIndexingPayload>) {
    logger.info(`Running index job (id: ${job.id}) for repo ${job.data.repo.displayName}`);
    const repo = job.data.repo as RepoWithConnections;

    // The repo snapshot inside the job was taken when the job was enqueued, so
    // the current indexing status has to be read back from the DB.
    const persistedRepo = await this.db.repo.findUnique({ where: { id: repo.id } });
    if (!persistedRepo) {
        logger.error(`Repo ${repo.id} not found`);
        const e = new Error(`Repo ${repo.id} not found`);
        Sentry.captureException(e);
        throw e;
    }

    // Captured *before* the status is overwritten below.
    const wasAlreadyIndexing = persistedRepo.repoIndexingStatus === RepoIndexingStatus.INDEXING;

    await this.db.repo.update({
        where: { id: repo.id },
        data: { repoIndexingStatus: RepoIndexingStatus.INDEXING },
    });

    this.promClient.activeRepoIndexingJobs.inc();
    this.promClient.pendingRepoIndexingJobs.dec({ repo: repo.id.toString() });

    const maxAttempts = 3;
    for (let attempts = 1; attempts <= maxAttempts; attempts++) {
        try {
            await this.syncGitRepository(repo, wasAlreadyIndexing);
            return;
        } catch (error) {
            Sentry.captureException(error);
            this.promClient.repoIndexingReattemptsTotal.inc();

            if (attempts === maxAttempts) {
                logger.error(`Failed to sync repository ${repo.name} (id: ${repo.id}) after ${maxAttempts} attempts. Error: ${error}`);
                throw error;
            }

            // Back-off doubles per failed attempt: base, 2 * base, ...
            const sleepDuration = (env.REPO_SYNC_RETRY_BASE_SLEEP_SECONDS * 1000) * Math.pow(2, attempts - 1);
            logger.error(`Failed to sync repository ${repo.name} (id: ${repo.id}), attempt ${attempts}/${maxAttempts}. Sleeping for ${sleepDuration / 1000}s... Error: ${error}`);
            await new Promise(resolve => setTimeout(resolve, sleepDuration));
        }
    }
}
/**
 * Completion hook for index jobs: updates the job metrics and records the repo
 * as INDEXED with the current time as `indexedAt`.
 *
 * @param job - The index job that finished successfully.
 */
private async onIndexJobCompleted(job: Job<RepoIndexingPayload>) {
    logger.info(`Repo index job for repo ${job.data.repo.displayName} (id: ${job.data.repo.id}, jobId: ${job.id}) completed`);

    const { activeRepoIndexingJobs, repoIndexingSuccessTotal } = this.promClient;
    activeRepoIndexingJobs.dec();
    repoIndexingSuccessTotal.inc();

    const { id: repoId } = job.data.repo;
    await this.db.repo.update({
        where: { id: repoId },
        data: {
            indexedAt: new Date(),
            repoIndexingStatus: RepoIndexingStatus.INDEXED,
        },
    });
}
/**
 * Failure hook for index jobs. The error is always logged and reported to
 * Sentry; metrics and the repo status are only updated when the job object is
 * available.
 *
 * @param job - The failed job, or `undefined` when the queue could not supply it.
 * @param err - The error that caused the failure.
 */
private async onIndexJobFailed(job: Job<RepoIndexingPayload> | undefined, err: unknown) {
    logger.info(`Repo index job for repo ${job?.data.repo.displayName} (id: ${job?.data.repo.id}, jobId: ${job?.id}) failed with error: ${err}`);

    const tags = {
        repoId: job?.data.repo.id,
        jobId: job?.id,
        queue: REPO_INDEXING_QUEUE,
    };
    Sentry.captureException(err, { tags });

    // Without a job there is no repo to mark as failed.
    if (!job) {
        return;
    }

    this.promClient.activeRepoIndexingJobs.dec();
    this.promClient.repoIndexingFailTotal.inc();

    await this.db.repo.update({
        where: { id: job.data.repo.id },
        data: { repoIndexingStatus: RepoIndexingStatus.FAILED },
    });
}
///////////////////////////
// Repo garbage collection
///////////////////////////
/**
 * Flags the given repos as IN_GC_QUEUE and enqueues one garbage collection job
 * per repo. Both steps run inside a single DB transaction callback, so an
 * enqueue failure rejects the callback.
 *
 * @param repos - Repos to schedule for garbage collection.
 */
private async scheduleRepoGarbageCollectionBulk(repos: Repo[]) {
    const repoIds = repos.map(({ id }) => id);
    const gcJobs = repos.map((repo) => ({
        name: 'repoGarbageCollectionJob',
        data: { repo },
        opts: {
            removeOnComplete: env.REDIS_REMOVE_ON_COMPLETE,
            removeOnFail: env.REDIS_REMOVE_ON_FAIL,
        },
    }));

    await this.db.$transaction(async (tx) => {
        await tx.repo.updateMany({
            where: { id: { in: repoIds } },
            data: { repoIndexingStatus: RepoIndexingStatus.IN_GC_QUEUE },
        });

        await this.gcQueue.addBulk(gcJobs);
        logger.info(`Added ${repos.length} jobs to gcQueue`);
    });
}
/**
 * Finds repos that are eligible for garbage collection and schedules them.
 *
 * Two groups are collected:
 *  1. INDEXED or FAILED repos that are not referenced by any connection.
 *  2. Repos belonging to orgs whose Stripe subscription has been INACTIVE for
 *     more than seven days.
 *
 * In both groups a repo only qualifies when it was never indexed or was last
 * indexed before the configured grace period. A repo can match both groups, so
 * the combined list is de-duplicated by id before scheduling; this ensures at
 * most one garbage collection job is queued per repo.
 */
private async fetchAndScheduleRepoGarbageCollection() {
    const thresholdDate = new Date(Date.now() - this.settings.repoGarbageCollectionGracePeriodMs);

    // Shared "outside the grace period" filter.
    const indexedBeforeThreshold = [
        { indexedAt: null },
        { indexedAt: { lt: thresholdDate } },
    ];

    ////////////////////////////////////
    // Get repos with no connections
    ////////////////////////////////////
    const reposWithNoConnections = await this.db.repo.findMany({
        where: {
            repoIndexingStatus: {
                in: [
                    RepoIndexingStatus.INDEXED, // we don't include NEW repos here because they'll be picked up by the index queue (potential race condition)
                    RepoIndexingStatus.FAILED,
                ]
            },
            connections: {
                none: {}
            },
            OR: indexedBeforeThreshold,
        },
    });
    if (reposWithNoConnections.length > 0) {
        logger.info(`Garbage collecting ${reposWithNoConnections.length} repos with no connections: ${reposWithNoConnections.map(repo => repo.id).join(', ')}`);
    }

    ////////////////////////////////////
    // Get inactive org repos
    ////////////////////////////////////
    const sevenDaysAgo = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000);
    const inactiveOrgRepos = await this.db.repo.findMany({
        where: {
            org: {
                stripeSubscriptionStatus: StripeSubscriptionStatus.INACTIVE,
                stripeLastUpdatedAt: {
                    lt: sevenDaysAgo
                }
            },
            OR: indexedBeforeThreshold,
        }
    });
    if (inactiveOrgRepos.length > 0) {
        logger.info(`Garbage collecting ${inactiveOrgRepos.length} inactive org repos: ${inactiveOrgRepos.map(repo => repo.id).join(', ')}`);
    }

    // De-duplicate by id: a connection-less repo in an inactive org is returned
    // by both queries and must only be scheduled once.
    const reposById = new Map(
        [...reposWithNoConnections, ...inactiveOrgRepos].map((repo) => [repo.id, repo] as const)
    );
    const reposToDelete = [...reposById.values()];

    if (reposToDelete.length > 0) {
        await this.scheduleRepoGarbageCollectionBulk(reposToDelete);
    }
}
/**
 * Worker entry point for a garbage collection job. Marks the repo as
 * GARBAGE_COLLECTING, removes its on-disk clone (unless the repo path is
 * read-only) and deletes every index shard file carrying the repo's shard
 * prefix.
 *
 * @param job - Queue job whose payload carries the repo to collect.
 */
private async runGarbageCollectionJob(job: Job<RepoGarbageCollectionPayload>) {
    logger.info(`Running garbage collection job (id: ${job.id}) for repo ${job.data.repo.displayName} (id: ${job.data.repo.id})`);
    this.promClient.activeRepoGarbageCollectionJobs.inc();

    const repo = job.data.repo as Repo;
    await this.db.repo.update({
        where: { id: repo.id },
        data: { repoIndexingStatus: RepoIndexingStatus.GARBAGE_COLLECTING },
    });

    // Step 1: remove the cloned repo. Read-only paths are never deleted.
    const { path: repoPath, isReadOnly } = getRepoPath(repo, this.ctx);
    const shouldDeleteClone = existsSync(repoPath) && !isReadOnly;
    if (shouldDeleteClone) {
        logger.info(`Deleting repo directory ${repoPath}`);
        await promises.rm(repoPath, { recursive: true, force: true });
    }

    // Step 2: remove the shards that belong to this repo.
    const shardPrefix = getShardPrefix(repo.orgId, repo.id);
    const shardPaths = readdirSync(this.ctx.indexPath)
        .filter((entry) => entry.startsWith(shardPrefix))
        .map((entry) => `${this.ctx.indexPath}/${entry}`);

    for (const filePath of shardPaths) {
        logger.info(`Deleting shard file ${filePath}`);
        await promises.rm(filePath, { force: true });
    }
}
/**
 * Completion hook for garbage collection jobs: updates the GC metrics and
 * deletes the repo row from the DB.
 *
 * @param job - The garbage collection job that finished successfully.
 */
private async onGarbageCollectionJobCompleted(job: Job<RepoGarbageCollectionPayload>) {
    logger.info(`Garbage collection job ${job.id} completed`);

    const { activeRepoGarbageCollectionJobs, repoGarbageCollectionSuccessTotal } = this.promClient;
    activeRepoGarbageCollectionJobs.dec();
    repoGarbageCollectionSuccessTotal.inc();

    const { id: repoId } = job.data.repo;
    await this.db.repo.delete({ where: { id: repoId } });
}
/**
 * Failure hook for garbage collection jobs. The error is always logged and
 * reported to Sentry; metrics and the repo status are only updated when the
 * job object is available.
 *
 * @param job - The failed job, or `undefined` when the queue could not supply it.
 * @param err - The error that caused the failure.
 */
private async onGarbageCollectionJobFailed(job: Job<RepoGarbageCollectionPayload> | undefined, err: unknown) {
    logger.info(`Garbage collection job failed (id: ${job?.id ?? 'unknown'}) with error: ${err}`);

    const tags = {
        repoId: job?.data.repo.id,
        jobId: job?.id,
        queue: REPO_GC_QUEUE,
    };
    Sentry.captureException(err, { tags });

    // Without a job there is no repo whose status could be updated.
    if (!job) {
        return;
    }

    this.promClient.activeRepoGarbageCollectionJobs.dec();
    this.promClient.repoGarbageCollectionFailTotal.inc();

    await this.db.repo.update({
        where: { id: job.data.repo.id },
        data: { repoIndexingStatus: RepoIndexingStatus.GARBAGE_COLLECTION_FAILED },
    });
}
///////////////////////////
// Repo index validation
///////////////////////////
/**
 * Verifies that every repo marked INDEXED in the DB has at least one shard
 * file (matched by shard prefix) in the index directory. Repos without any
 * shard are reset to NEW so that they get indexed again.
 *
 * If the index directory cannot be read, the error is logged and validation is
 * skipped: no repo is reset, since missing shards cannot be distinguished from
 * an unreadable directory.
 */
public async validateIndexedReposHaveShards() {
    logger.info('Validating indexed repos have shards...');

    const indexedRepos = await this.db.repo.findMany({
        where: {
            repoIndexingStatus: RepoIndexingStatus.INDEXED
        }
    });
    logger.info(`Found ${indexedRepos.length} repos in the DB marked as INDEXED`);

    if (indexedRepos.length === 0) {
        return;
    }

    // The directory read is the only step here that can fail on I/O, so it is
    // the step that has to be guarded.
    let files: string[];
    try {
        files = readdirSync(this.ctx.indexPath);
    } catch (error) {
        logger.error(`Failed to read index directory ${this.ctx.indexPath}: ${error}`);
        return;
    }

    const reposToReindex: number[] = [];
    for (const repo of indexedRepos) {
        const shardPrefix = getShardPrefix(repo.orgId, repo.id);

        // TODO: this doesn't take into account if a repo has multiple shards and only some of them are missing. To support that, this logic
        // would need to know how many total shards are expected for this repo
        const hasShards = files.some(file => file.startsWith(shardPrefix));
        if (!hasShards) {
            logger.info(`Repo ${repo.displayName} (id: ${repo.id}) is marked as INDEXED but has no shards on disk. Marking for reindexing.`);
            reposToReindex.push(repo.id);
        }
    }

    if (reposToReindex.length > 0) {
        await this.db.repo.updateMany({
            where: {
                id: { in: reposToReindex }
            },
            data: {
                repoIndexingStatus: RepoIndexingStatus.NEW
            }
        });
        logger.info(`Marked ${reposToReindex.length} repos for reindexing due to missing shards`);
    }

    logger.info('Done validating indexed repos have shards');
}
/**
 * Looks for repos that are still INDEXING but whose row has not been updated
 * within the configured index timeout, and hands them to
 * `scheduleRepoTimeoutsBulk`.
 */
private async fetchAndScheduleRepoTimeouts() {
    const staleBefore = new Date(Date.now() - this.settings.repoIndexTimeoutMs);

    const repos = await this.db.repo.findMany({
        where: {
            repoIndexingStatus: RepoIndexingStatus.INDEXING,
            updatedAt: { lt: staleBefore },
        },
    });

    if (repos.length === 0) {
        return;
    }

    logger.info(`Scheduling ${repos.length} repo timeouts`);
    await this.scheduleRepoTimeoutsBulk(repos);
}
/**
 * Marks every given repo as FAILED in a single bulk update, executed inside a
 * DB transaction.
 *
 * @param repos - Repos whose indexing timed out.
 */
private async scheduleRepoTimeoutsBulk(repos: Repo[]) {
    const timedOutIds = repos.map(({ id }) => id);

    await this.db.$transaction(async (tx) => {
        await tx.repo.updateMany({
            where: { id: { in: timedOutIds } },
            data: { repoIndexingStatus: RepoIndexingStatus.FAILED },
        });
    });
}
/**
 * Stops the polling interval (if one is running) and closes the index/GC
 * workers and queues.
 *
 * All four `close()` calls are started together and awaited, so the returned
 * promise only settles once shutdown has finished and a close failure rejects
 * it instead of surfacing as an unhandled rejection.
 * NOTE(review): assumes `close()` returns a promise (as in BullMQ's
 * Queue/Worker API) — confirm against the queue library in use. Awaiting a
 * non-promise value is harmless.
 */
public async dispose() {
    if (this.interval) {
        clearInterval(this.interval);
    }

    await Promise.all([
        this.indexWorker.close(),
        this.indexQueue.close(),
        this.gcQueue.close(),
        this.gcWorker.close(),
    ]);
}
}

View file

@ -1,8 +1,50 @@
import { Connection, Repo, RepoToConnection } from "@sourcebot/db";
import { Settings as SettingsSchema } from "@sourcebot/schemas/v3/index.type";
import { z } from "zod";
/**
 * Filesystem locations used by the application.
 */
export type AppContext = {
/**
* Path to the repos cache directory.
*/
reposPath: string;
/**
* Path to the index cache directory.
*/
indexPath: string;
/**
* NOTE(review): undocumented in the original; presumably the root cache
* directory — confirm against where the context is constructed.
*/
cachePath: string;
}
export type Settings = Required<SettingsSchema>;
// Structure of the `metadata` field in the `Repo` table. Every field is
// optional.
//
// @WARNING: If you modify this schema, please make sure it is backwards
// compatible with any prior versions of the schema!!
// @NOTE: If you move this schema, please update the comment in schema.prisma
// to point to the new location.
export const repoMetadataSchema = z.object({
/**
* A set of key-value pairs that will be used as git config
* variables when cloning the repo.
* @see: https://git-scm.com/docs/git-clone#Documentation/git-clone.txt-code--configcodecodeltkeygtltvaluegtcode
*/
gitConfig: z.record(z.string(), z.string()).optional(),
/**
* A list of branches to index. Glob patterns are supported.
*/
branches: z.array(z.string()).optional(),
/**
* A list of tags to index. Glob patterns are supported.
*/
tags: z.array(z.string()).optional(),
});
// Static type inferred from `repoMetadataSchema`.
export type RepoMetadata = z.infer<typeof repoMetadataSchema>;
// @see : https://stackoverflow.com/a/61132308
export type DeepPartial<T> = T extends object ? {
[P in keyof T]?: DeepPartial<T[P]>;

View file

@ -1,13 +1,11 @@
import { Logger } from "winston";
import { RepoAuthCredentials, RepoWithConnections } from "./types.js";
import { AppContext, RepoAuthCredentials, RepoWithConnections } from "./types.js";
import path from 'path';
import { Repo } from "@sourcebot/db";
import { getTokenFromConfig } from "@sourcebot/shared";
import { PrismaClient, Repo } from "@sourcebot/db";
import { getTokenFromConfig as getTokenFromConfigBase } from "@sourcebot/crypto";
import { BackendException, BackendError } from "@sourcebot/error";
import * as Sentry from "@sentry/node";
import { GithubConnectionConfig, GitlabConnectionConfig, GiteaConnectionConfig, BitbucketConnectionConfig, AzureDevOpsConnectionConfig } from '@sourcebot/schemas/v3/connection.type';
import { GithubAppManager } from "./ee/githubAppManager.js";
import { hasEntitlement } from "@sourcebot/shared";
import { REPOS_CACHE_DIR } from "./constants.js";
export const measure = async <T>(cb: () => Promise<T>) => {
const start = Date.now();
@ -23,6 +21,22 @@ export const marshalBool = (value?: boolean) => {
return !!value ? '1' : '0';
}
/**
 * Resolves a token through the base implementation and converts `Error`
 * failures into a `BackendException` (`CONNECTION_SYNC_SECRET_DNE`) that is
 * reported to Sentry and, when a logger is supplied, logged.
 *
 * @param token - Token configuration forwarded to the base implementation.
 * @param orgId - Org forwarded to the base implementation.
 * @param db - Prisma client forwarded to the base implementation.
 * @param logger - Optional logger for the failure message.
 * @throws BackendException when the base implementation throws an `Error`;
 *         any other thrown value is re-thrown unchanged.
 */
export const getTokenFromConfig = async (token: any, orgId: number, db: PrismaClient, logger?: Logger) => {
    try {
        return await getTokenFromConfigBase(token, orgId, db);
    } catch (error: unknown) {
        // Non-Error throwables are passed through untouched.
        if (!(error instanceof Error)) {
            throw error;
        }

        const { message } = error;
        const wrapped = new BackendException(BackendError.CONNECTION_SYNC_SECRET_DNE, { message });
        Sentry.captureException(wrapped);
        logger?.error(message);
        throw wrapped;
    }
};
export const resolvePathRelativeToConfig = (localPath: string, configPath: string) => {
let absolutePath = localPath;
if (!path.isAbsolute(absolutePath)) {
@ -55,11 +69,11 @@ export const arraysEqualShallow = <T>(a?: readonly T[], b?: readonly T[]) => {
// @note: this function is duplicated in `packages/web/src/features/fileTree/actions.ts`.
// @todo: we should move this to a shared package.
export const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } => {
export const getRepoPath = (repo: Repo, ctx: AppContext): { path: string, isReadOnly: boolean } => {
// If we are dealing with a local repository, then use that as the path.
// Mark as read-only since we aren't guaranteed to have write access to the local filesystem.
const cloneUrl = new URL(repo.cloneUrl);
if (repo.external_codeHostType === 'genericGitHost' && cloneUrl.protocol === 'file:') {
if (repo.external_codeHostType === 'generic-git-host' && cloneUrl.protocol === 'file:') {
return {
path: cloneUrl.pathname,
isReadOnly: true,
@ -67,7 +81,7 @@ export const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } =
}
return {
path: path.join(REPOS_CACHE_DIR, repo.id.toString()),
path: path.join(ctx.reposPath, repo.id.toString()),
isReadOnly: false,
}
}
@ -110,36 +124,12 @@ export const fetchWithRetry = async <T>(
// fetch the token here using the connections from the repo. Multiple connections could be referencing this repo, and each
// may have their own token. This method will just pick the first connection that has a token (if one exists) and uses that. This
// may technically cause syncing to fail if that connection's token just so happens to not have access to the repo it's referencing.
export const getAuthCredentialsForRepo = async (repo: RepoWithConnections, logger?: Logger): Promise<RepoAuthCredentials | undefined> => {
// If we have github apps configured we assume that we must use them for github service auth
if (repo.external_codeHostType === 'github' && hasEntitlement('github-app') && GithubAppManager.getInstance().appsConfigured()) {
logger?.debug(`Using GitHub App for service auth for repo ${repo.displayName} hosted at ${repo.external_codeHostUrl}`);
const owner = repo.displayName?.split('/')[0];
const deploymentHostname = new URL(repo.external_codeHostUrl).hostname;
if (!owner || !deploymentHostname) {
throw new Error(`Failed to fetch GitHub App for repo ${repo.displayName}:Invalid repo displayName (${repo.displayName}) or deployment hostname (${deploymentHostname})`);
}
const token = await GithubAppManager.getInstance().getInstallationToken(owner, deploymentHostname);
return {
hostUrl: repo.external_codeHostUrl,
token,
cloneUrlWithToken: createGitCloneUrlWithToken(
repo.cloneUrl,
{
username: 'x-access-token',
password: token
}
),
}
}
export const getAuthCredentialsForRepo = async (repo: RepoWithConnections, db: PrismaClient, logger?: Logger): Promise<RepoAuthCredentials | undefined> => {
for (const { connection } of repo.connections) {
if (connection.connectionType === 'github') {
const config = connection.config as unknown as GithubConnectionConfig;
if (config.token) {
const token = await getTokenFromConfig(config.token);
const token = await getTokenFromConfig(config.token, connection.orgId, db, logger);
return {
hostUrl: config.url,
token,
@ -154,7 +144,7 @@ export const getAuthCredentialsForRepo = async (repo: RepoWithConnections, logge
} else if (connection.connectionType === 'gitlab') {
const config = connection.config as unknown as GitlabConnectionConfig;
if (config.token) {
const token = await getTokenFromConfig(config.token);
const token = await getTokenFromConfig(config.token, connection.orgId, db, logger);
return {
hostUrl: config.url,
token,
@ -170,7 +160,7 @@ export const getAuthCredentialsForRepo = async (repo: RepoWithConnections, logge
} else if (connection.connectionType === 'gitea') {
const config = connection.config as unknown as GiteaConnectionConfig;
if (config.token) {
const token = await getTokenFromConfig(config.token);
const token = await getTokenFromConfig(config.token, connection.orgId, db, logger);
return {
hostUrl: config.url,
token,
@ -185,7 +175,7 @@ export const getAuthCredentialsForRepo = async (repo: RepoWithConnections, logge
} else if (connection.connectionType === 'bitbucket') {
const config = connection.config as unknown as BitbucketConnectionConfig;
if (config.token) {
const token = await getTokenFromConfig(config.token);
const token = await getTokenFromConfig(config.token, connection.orgId, db, logger);
const username = config.user ?? 'x-token-auth';
return {
hostUrl: config.url,
@ -202,7 +192,7 @@ export const getAuthCredentialsForRepo = async (repo: RepoWithConnections, logge
} else if (connection.connectionType === 'azuredevops') {
const config = connection.config as unknown as AzureDevOpsConnectionConfig;
if (config.token) {
const token = await getTokenFromConfig(config.token);
const token = await getTokenFromConfig(config.token, connection.orgId, db, logger);
// For ADO server, multiple auth schemes may be supported. If the ADO deployment supports NTLM, the git clone will default
// to this over basic auth. As a result, we cannot embed the token in the clone URL and must force basic auth by passing in the token
@ -251,44 +241,3 @@ const createGitCloneUrlWithToken = (cloneUrl: string, credentials: { username?:
}
return url.toString();
}
/**
 * Runs a groupmq worker lifecycle callback and contains any exception it
 * throws: the error is reported to Sentry and logged instead of propagating,
 * so that a failing callback (e.g., a RepoIndexingJob that no longer exists in
 * the DB) cannot crash the app.
 *
 * @param name - Name of the lifecycle function, used in the log message.
 * @param logger - Logger that receives the failure.
 * @param fn - The lifecycle callback to execute.
 * @see: https://openpanel-dev.github.io/groupmq/api-worker/#events
 */
export const groupmqLifecycleExceptionWrapper = async (name: string, logger: Logger, fn: () => Promise<void>) => {
    try {
        await fn();
    } catch (lifecycleError) {
        Sentry.captureException(lifecycleError);
        logger.error(`Exception thrown while executing lifecycle function \`${name}\`.`, lifecycleError);
    }
}
/**
 * `setInterval` wrapper that ensures an async callback is never executed
 * concurrently with itself: a tick that fires while the previous invocation is
 * still pending is skipped.
 *
 * The "running" flag is kept in a closure that is private to this interval, so
 * the caller's function object is not mutated and two intervals that share the
 * same callback do not block each other.
 *
 * @param target - Async callback to invoke on every tick.
 * @param pollingIntervalMs - Interval between ticks, in milliseconds.
 * @returns The timer handle; pass it to `clearInterval` to stop polling.
 * @see: https://mottaquikarim.github.io/dev/posts/setinterval-that-blocks-on-await/
 */
export const setIntervalAsync = (target: () => Promise<void>, pollingIntervalMs: number): NodeJS.Timeout => {
    let isRunning = false;

    const tick = async (): Promise<void> => {
        if (isRunning) return;
        isRunning = true;
        try {
            await target();
        } finally {
            // Always release the flag, even when `target` rejects.
            isRunning = false;
        }
    };

    return setInterval(tick, pollingIntervalMs);
}

View file

@ -1,39 +1,82 @@
import { Repo } from "@sourcebot/db";
import { createLogger, env } from "@sourcebot/shared";
import { exec } from "child_process";
import { INDEX_CACHE_DIR } from "./constants.js";
import { Settings } from "./types.js";
import { getRepoPath, getShardPrefix } from "./utils.js";
import { AppContext, repoMetadataSchema, Settings } from "./types.js";
import { Repo } from "@sourcebot/db";
import { getRepoPath } from "./utils.js";
import { getShardPrefix } from "./utils.js";
import { getBranches, getTags } from "./git.js";
import micromatch from "micromatch";
import { createLogger } from "@sourcebot/logger";
import { captureEvent } from "./posthog.js";
const logger = createLogger('zoekt');
export const indexGitRepository = async (repo: Repo, settings: Settings, revisions: string[], signal?: AbortSignal) => {
const { path: repoPath } = getRepoPath(repo);
export const indexGitRepository = async (repo: Repo, settings: Settings, ctx: AppContext) => {
let revisions = [
'HEAD'
];
const { path: repoPath } = getRepoPath(repo, ctx);
const shardPrefix = getShardPrefix(repo.orgId, repo.id);
const metadata = repoMetadataSchema.parse(repo.metadata);
const largeFileGlobPatterns = env.ALWAYS_INDEX_FILE_PATTERNS?.split(',').map(pattern => pattern.trim()) ?? [];
if (metadata.branches) {
const branchGlobs = metadata.branches
const allBranches = await getBranches(repoPath);
const matchingBranches =
allBranches
.filter((branch) => micromatch.isMatch(branch, branchGlobs))
.map((branch) => `refs/heads/${branch}`);
revisions = [
...revisions,
...matchingBranches
];
}
if (metadata.tags) {
const tagGlobs = metadata.tags;
const allTags = await getTags(repoPath);
const matchingTags =
allTags
.filter((tag) => micromatch.isMatch(tag, tagGlobs))
.map((tag) => `refs/tags/${tag}`);
revisions = [
...revisions,
...matchingTags
];
}
// zoekt has a limit of 64 branches/tags to index.
if (revisions.length > 64) {
logger.warn(`Too many revisions (${revisions.length}) for repo ${repo.id}, truncating to 64`);
captureEvent('backend_revisions_truncated', {
repoId: repo.id,
revisionCount: revisions.length,
});
revisions = revisions.slice(0, 64);
}
const command = [
'zoekt-git-index',
'-allow_missing_branches',
`-index ${INDEX_CACHE_DIR}`,
`-index ${ctx.indexPath}`,
`-max_trigram_count ${settings.maxTrigramCount}`,
`-file_limit ${settings.maxFileSize}`,
`-branches "${revisions.join(',')}"`,
`-tenant_id ${repo.orgId}`,
`-repo_id ${repo.id}`,
`-shard_prefix ${shardPrefix}`,
...largeFileGlobPatterns.map((pattern) => `-large_file ${pattern}`),
repoPath
].join(' ');
return new Promise<{ stdout: string, stderr: string }>((resolve, reject) => {
exec(command, { signal }, (error, stdout, stderr) => {
exec(command, (error, stdout, stderr) => {
if (error) {
reject(error);
return;
}
if (stdout) {
stdout.split('\n').filter(line => line.trim()).forEach(line => {
logger.info(line);
@ -46,7 +89,7 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
logger.info(line);
});
}
resolve({
stdout,
stderr

View file

@ -4,8 +4,5 @@ export default defineConfig({
test: {
environment: 'node',
watch: false,
env: {
DATA_CACHE_DIR: 'test-data'
}
}
});

1
packages/crypto/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
.env.local

View file

@ -0,0 +1,19 @@
{
"name": "@sourcebot/crypto",
"version": "0.1.0",
"main": "dist/index.js",
"private": true,
"scripts": {
"build": "tsc",
"postinstall": "yarn build"
},
"dependencies": {
"@sourcebot/db": "*",
"@sourcebot/schemas": "*",
"dotenv": "^16.4.5"
},
"devDependencies": {
"@types/node": "^22.7.5",
"typescript": "^5.7.3"
}
}

View file

@ -0,0 +1,13 @@
import dotenv from 'dotenv';
// Returns `env` unless it is null/undefined, in which case `defaultValue`
// (which may itself be undefined) is returned.
export const getEnv = (env: string | undefined, defaultValue?: string) => {
return env ?? defaultValue;
}
// Load variables from `./.env.local` before any of them are read below.
// `override: true` asks dotenv to replace variables that are already set in
// the environment with the values from that file (see the dotenv docs).
dotenv.config({
path: './.env.local',
override: true
});
// @note: You can use https://generate-random.org/encryption-key-generator to create a new 32 byte key
// No default is passed, so this is `undefined` when the variable is not set.
export const SOURCEBOT_ENCRYPTION_KEY = getEnv(process.env.SOURCEBOT_ENCRYPTION_KEY);

View file

@ -1,8 +1,6 @@
import crypto from 'crypto';
import fs from 'fs';
import { env } from './env.server.js';
import { Token } from '@sourcebot/schemas/v3/shared.type';
import { SecretManagerServiceClient } from "@google-cloud/secret-manager";
import { SOURCEBOT_ENCRYPTION_KEY } from './environment';
const algorithm = 'aes-256-cbc';
const ivLength = 16; // 16 bytes for CBC
@ -14,7 +12,11 @@ const generateIV = (): Buffer => {
};
export function encrypt(text: string): { iv: string; encryptedData: string } {
const encryptionKey = Buffer.from(env.SOURCEBOT_ENCRYPTION_KEY, 'ascii');
if (!SOURCEBOT_ENCRYPTION_KEY) {
throw new Error('Encryption key is not set');
}
const encryptionKey = Buffer.from(SOURCEBOT_ENCRYPTION_KEY, 'ascii');
const iv = generateIV();
const cipher = crypto.createCipheriv(algorithm, encryptionKey, iv);
@ -26,10 +28,18 @@ export function encrypt(text: string): { iv: string; encryptedData: string } {
}
export function hashSecret(text: string): string {
return crypto.createHmac('sha256', env.SOURCEBOT_ENCRYPTION_KEY).update(text).digest('hex');
if (!SOURCEBOT_ENCRYPTION_KEY) {
throw new Error('Encryption key is not set');
}
return crypto.createHmac('sha256', SOURCEBOT_ENCRYPTION_KEY).update(text).digest('hex');
}
export function generateApiKey(): { key: string; hash: string } {
if (!SOURCEBOT_ENCRYPTION_KEY) {
throw new Error('Encryption key is not set');
}
const secret = crypto.randomBytes(32).toString('hex');
const hash = hashSecret(secret);
@ -40,7 +50,11 @@ export function generateApiKey(): { key: string; hash: string } {
}
export function decrypt(iv: string, encryptedText: string): string {
const encryptionKey = Buffer.from(env.SOURCEBOT_ENCRYPTION_KEY, 'ascii');
if (!SOURCEBOT_ENCRYPTION_KEY) {
throw new Error('Encryption key is not set');
}
const encryptionKey = Buffer.from(SOURCEBOT_ENCRYPTION_KEY, 'ascii');
const ivBuffer = Buffer.from(iv, 'hex');
const encryptedBuffer = Buffer.from(encryptedText, 'hex');
@ -78,30 +92,4 @@ export function verifySignature(data: string, signature: string, publicKeyPath:
}
}
/**
 * Resolves a token configuration to its secret value.
 *
 * - `{ env }`: read from `process.env`; an unset or empty variable is an error.
 * - `{ googleCloudSecret }`: fetched through the Google Cloud Secret Manager
 *   client. Every failure on this path, including a missing payload, is
 *   re-thrown with a "Failed to access Google Cloud secret" prefix.
 *
 * @param token - The token configuration to resolve.
 * @returns The resolved token string.
 * @throws Error when the token cannot be resolved or has an unknown shape.
 */
export const getTokenFromConfig = async (token: Token): Promise<string> => {
    if ('env' in token) {
        const envToken = process.env[token.env];
        if (envToken) {
            return envToken;
        }
        throw new Error(`Environment variable ${token.env} not found.`);
    }

    if (!('googleCloudSecret' in token)) {
        throw new Error('Invalid token configuration');
    }

    try {
        const secretManager = new SecretManagerServiceClient();
        const [response] = await secretManager.accessSecretVersion({
            name: token.googleCloudSecret,
        });

        const secretData = response.payload?.data;
        if (!secretData) {
            // Thrown inside the try on purpose: the catch below wraps it too.
            throw new Error(`Secret ${token.googleCloudSecret} not found.`);
        }
        return secretData.toString();
    } catch (error) {
        throw new Error(`Failed to access Google Cloud secret ${token.googleCloudSecret}: ${error instanceof Error ? error.message : String(error)}`);
    }
};
export { getTokenFromConfig } from './tokenUtils.js';

View file

@ -0,0 +1,33 @@
import { PrismaClient } from "@sourcebot/db";
import { Token } from "@sourcebot/schemas/v3/shared.type";
import { decrypt } from "./index.js";
/**
 * Resolves a token configuration to its secret value.
 *
 * - `{ secret }`: looks up the org's secret by key in the DB and decrypts it.
 * - `{ env }`: read from `process.env`; an unset or empty variable is an error.
 *
 * @param token - The token configuration to resolve.
 * @param orgId - Org that owns the secret (used for `{ secret }` tokens).
 * @param db - Prisma client used for the secret lookup.
 * @returns The resolved token string.
 * @throws Error when the secret or variable is missing, or the shape is unknown.
 */
export const getTokenFromConfig = async (token: Token, orgId: number, db: PrismaClient) => {
    if ('secret' in token) {
        const secretKey = token.secret;
        const storedSecret = await db.secret.findUnique({
            where: {
                orgId_key: { key: secretKey, orgId },
            },
        });

        if (!storedSecret) {
            throw new Error(`Secret with key ${secretKey} not found for org ${orgId}`);
        }
        return decrypt(storedSecret.iv, storedSecret.encryptedValue);
    }

    if ('env' in token) {
        const envToken = process.env[token.env];
        if (envToken) {
            return envToken;
        }
        throw new Error(`Environment variable ${token.env} not found.`);
    }

    throw new Error('Invalid token configuration');
};

View file

@ -2,7 +2,6 @@
"compilerOptions": {
"target": "ES2022",
"module": "Node16",
"moduleResolution": "Node16",
"lib": ["ES2023"],
"outDir": "dist",
"rootDir": "src",
@ -12,12 +11,14 @@
"strict": true,
"noImplicitAny": true,
"strictNullChecks": true,
"moduleResolution": "Node16",
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"skipLibCheck": true,
"isolatedModules": true,
"resolveJsonModule": true
},
"include": ["src/index.ts"],
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}
}

View file

@ -25,6 +25,7 @@
},
"dependencies": {
"@prisma/client": "6.2.1",
"@sourcebot/logger": "workspace:*",
"@types/readline-sync": "^1.4.8",
"readline-sync": "^1.4.10"
}

View file

@ -1,34 +0,0 @@
/*
Warnings:
- You are about to drop the column `repoIndexingStatus` on the `Repo` table. All the data in the column will be lost.
*/
-- Replaces the per-repo status column with a dedicated job table: each
-- "RepoIndexingJob" row records one INDEX or CLEANUP job and its status.
-- CreateEnum
CREATE TYPE "RepoIndexingJobStatus" AS ENUM ('PENDING', 'IN_PROGRESS', 'COMPLETED', 'FAILED');
-- CreateEnum
CREATE TYPE "RepoIndexingJobType" AS ENUM ('INDEX', 'CLEANUP');
-- AlterTable
-- Destructive: existing status values are discarded, not migrated.
ALTER TABLE "Repo" DROP COLUMN "repoIndexingStatus";
-- DropEnum
-- The enum is dropped only after the column that used it is gone.
DROP TYPE "RepoIndexingStatus";
-- CreateTable
-- "updatedAt" is NOT NULL without a DEFAULT, so every insert must supply it
-- (presumably maintained by the ORM — confirm against schema.prisma).
-- "completedAt" and "errorMessage" are nullable.
CREATE TABLE "RepoIndexingJob" (
"id" TEXT NOT NULL,
"type" "RepoIndexingJobType" NOT NULL,
"status" "RepoIndexingJobStatus" NOT NULL DEFAULT 'PENDING',
"createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
"updatedAt" TIMESTAMP(3) NOT NULL,
"completedAt" TIMESTAMP(3),
"errorMessage" TEXT,
"repoId" INTEGER NOT NULL,
CONSTRAINT "RepoIndexingJob_pkey" PRIMARY KEY ("id")
);
-- AddForeignKey
-- ON DELETE CASCADE: deleting a "Repo" row also deletes its job rows.
ALTER TABLE "RepoIndexingJob" ADD CONSTRAINT "RepoIndexingJob_repoId_fkey" FOREIGN KEY ("repoId") REFERENCES "Repo"("id") ON DELETE CASCADE ON UPDATE CASCADE;

View file

@ -1,2 +0,0 @@
-- AlterTable
-- Additive change: the new column is nullable and has no default, so existing
-- "Repo" rows get NULL.
ALTER TABLE "Repo" ADD COLUMN "indexedCommitHash" TEXT;

Some files were not shown because too many files have changed in this diff Show more