From 8dce2aaac3e7b68aa55a7d0c2c941b6b56dba46a Mon Sep 17 00:00:00 2001 From: bkellam Date: Sat, 15 Nov 2025 17:17:52 -0800 Subject: [PATCH] add lezer tree -> grpc transformer --- package.json | 2 +- packages/queryLanguage/package.json | 4 +- packages/queryLanguage/src/index.ts | 6 + packages/queryLanguage/test/precedence.txt | 200 +++++++++++++++ packages/queryLanguage/tsconfig.json | 2 +- packages/web/package.json | 1 + .../app/api/(server)/stream_search/route.ts | 36 +-- .../api/(server)/stream_search/transformer.ts | 230 ++++++++++++++++++ yarn.lock | 3 +- 9 files changed, 465 insertions(+), 19 deletions(-) create mode 100644 packages/queryLanguage/src/index.ts create mode 100644 packages/queryLanguage/test/precedence.txt create mode 100644 packages/web/src/app/api/(server)/stream_search/transformer.ts diff --git a/package.json b/package.json index a70bab99..4f3c2d2c 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "dev:prisma:studio": "yarn with-env yarn workspace @sourcebot/db prisma:studio", "dev:prisma:migrate:reset": "yarn with-env yarn workspace @sourcebot/db prisma:migrate:reset", "dev:prisma:db:push": "yarn with-env yarn workspace @sourcebot/db prisma:db:push", - "build:deps": "yarn workspaces foreach --recursive --topological --from '{@sourcebot/schemas,@sourcebot/db,@sourcebot/shared}' run build" + "build:deps": "yarn workspaces foreach --recursive --topological --from '{@sourcebot/schemas,@sourcebot/db,@sourcebot/shared,@sourcebot/query-language}' run build" }, "devDependencies": { "concurrently": "^9.2.1", diff --git a/packages/queryLanguage/package.json b/packages/queryLanguage/package.json index f1659da1..7486ff19 100644 --- a/packages/queryLanguage/package.json +++ b/packages/queryLanguage/package.json @@ -1,10 +1,10 @@ { "name": "@sourcebot/query-language", "private": true, + "main": "dist/index.js", "scripts": { "build": "lezer-generator src/query.grammar -o src/parser --typeScript --names && tsc", - "test": "vitest", - "asdf": 
"tsx test.ts" + "test": "vitest" }, "devDependencies": { "@lezer/generator": "^1.8.0", diff --git a/packages/queryLanguage/src/index.ts b/packages/queryLanguage/src/index.ts new file mode 100644 index 00000000..a5d01903 --- /dev/null +++ b/packages/queryLanguage/src/index.ts @@ -0,0 +1,6 @@ +import { parser } from "./parser"; + +type Tree = ReturnType; +type SyntaxNode = Tree['topNode']; +export type { Tree, SyntaxNode }; +export * from "./parser"; \ No newline at end of file diff --git a/packages/queryLanguage/test/precedence.txt b/packages/queryLanguage/test/precedence.txt new file mode 100644 index 00000000..d43e5b34 --- /dev/null +++ b/packages/queryLanguage/test/precedence.txt @@ -0,0 +1,200 @@ +# OR has lowest precedence - implicit AND groups first + +a b or c d + +==> + +Program(OrExpr(AndExpr(Term,Term),AndExpr(Term,Term))) + +# Multiple OR operators are left-associative + +a or b or c + +==> + +Program(OrExpr(Term,Term,Term)) + +# AND before OR + +file:test.js error or file:test.go panic + +==> + +Program(OrExpr(AndExpr(PrefixExpr(FileExpr),Term),AndExpr(PrefixExpr(FileExpr),Term))) + +# Negation binds tighter than AND + +-file:test.js error + +==> + +Program(AndExpr(NegateExpr(PrefixExpr(FileExpr)),Term)) + +# Negation binds tighter than OR + +-file:a.js or file:b.js + +==> + +Program(OrExpr(NegateExpr(PrefixExpr(FileExpr)),PrefixExpr(FileExpr))) + +# Parentheses override precedence + +(a or b) c + +==> + +Program(AndExpr(ParenExpr(OrExpr(Term,Term)),Term)) + +# Parentheses override - OR inside parens groups first + +a (b or c) + +==> + +Program(AndExpr(Term,ParenExpr(OrExpr(Term,Term)))) + +# Complex: AND, OR, and negation + +a -b or c d + +==> + +Program(OrExpr(AndExpr(Term,Term),AndExpr(Term,Term))) + +# Negated group in OR expression + +-(a b) or c + +==> + +Program(OrExpr(NegateExpr(ParenExpr(AndExpr(Term,Term))),Term)) + +# Multiple negations in OR + +-file:a.js or -file:b.js or file:c.js + +==> + 
+Program(OrExpr(NegateExpr(PrefixExpr(FileExpr)),NegateExpr(PrefixExpr(FileExpr)),PrefixExpr(FileExpr))) + +# Prefix binds to its value only + +file:a.js b.js + +==> + +Program(AndExpr(PrefixExpr(FileExpr),Term)) + +# OR with prefixes and terms mixed + +repo:backend error or repo:frontend warning + +==> + +Program(OrExpr(AndExpr(PrefixExpr(RepoExpr),Term),AndExpr(PrefixExpr(RepoExpr),Term))) + +# Nested parentheses with OR + +((a or b) c) or d + +==> + +Program(OrExpr(ParenExpr(AndExpr(ParenExpr(OrExpr(Term,Term)),Term)),Term)) + +# OR at different nesting levels + +(a or (b or c)) + +==> + +Program(ParenExpr(OrExpr(Term,ParenExpr(OrExpr(Term,Term))))) + +# Implicit AND groups all adjacent terms before OR + +a b c or d e f + +==> + +Program(OrExpr(AndExpr(Term,Term,Term),AndExpr(Term,Term,Term))) + +# Mixed prefix and regular terms with OR + +lang:go func or lang:rust fn + +==> + +Program(OrExpr(AndExpr(PrefixExpr(LangExpr),Term),AndExpr(PrefixExpr(LangExpr),Term))) + +# Negation doesn't affect OR grouping + +a or -b or c + +==> + +Program(OrExpr(Term,Term,Term)) + +# Parentheses can isolate OR from surrounding AND + +a (b or c) d + +==> + +Program(AndExpr(Term,ParenExpr(OrExpr(Term,Term)),Term)) + +# Multiple parenthesized groups with AND + +(a or b) (c or d) + +==> + +Program(AndExpr(ParenExpr(OrExpr(Term,Term)),ParenExpr(OrExpr(Term,Term)))) + +# Quoted strings are atomic - no precedence inside + +"a or b" + +==> + +Program(Term) + +# Prefix with OR value doesn't split + +file:"a.js or b.js" + +==> + +Program(PrefixExpr(FileExpr)) + +# Negated prefix in complex expression + +-file:test.js lang:go error or warning + +==> + +Program(OrExpr(AndExpr(NegateExpr(PrefixExpr(FileExpr)),PrefixExpr(LangExpr),Term),Term)) + +# OR followed by parenthesized AND + +a or (b c) + +==> + +Program(OrExpr(Term,ParenExpr(AndExpr(Term,Term)))) + +# Empty parens don't affect precedence + +() or a b + +==> + +Program(OrExpr(ParenExpr(Term(⚠)),AndExpr(Term,Term))) + +# Negation of 
empty group + +-() a + +==> + +Program(AndExpr(NegateExpr(ParenExpr(Term(⚠))),Term)) + diff --git a/packages/queryLanguage/tsconfig.json b/packages/queryLanguage/tsconfig.json index f2284da6..af60924e 100644 --- a/packages/queryLanguage/tsconfig.json +++ b/packages/queryLanguage/tsconfig.json @@ -18,6 +18,6 @@ "isolatedModules": true, "resolveJsonModule": true }, - "include": ["src/parser.ts"], + "include": ["src/index.ts"], "exclude": ["node_modules", "dist"] } \ No newline at end of file diff --git a/packages/web/package.json b/packages/web/package.json index 4cea2bac..a4f58011 100644 --- a/packages/web/package.json +++ b/packages/web/package.json @@ -94,6 +94,7 @@ "@shopify/lang-jsonc": "^1.0.0", "@sourcebot/codemirror-lang-tcl": "^1.0.12", "@sourcebot/db": "workspace:*", + "@sourcebot/query-language": "workspace:*", "@sourcebot/schemas": "workspace:*", "@sourcebot/shared": "workspace:*", "@ssddanbrown/codemirror-lang-twig": "^1.0.0", diff --git a/packages/web/src/app/api/(server)/stream_search/route.ts b/packages/web/src/app/api/(server)/stream_search/route.ts index ac834d35..118d32c6 100644 --- a/packages/web/src/app/api/(server)/stream_search/route.ts +++ b/packages/web/src/app/api/(server)/stream_search/route.ts @@ -16,6 +16,8 @@ import { PrismaClient, Repo } from '@sourcebot/db'; import { createLogger, env } from '@sourcebot/shared'; import { NextRequest } from 'next/server'; import * as path from 'path'; +import { parser } from '@sourcebot/query-language'; +import { transformToZoektQuery } from './transformer'; const logger = createLogger('streamSearchApi'); @@ -67,21 +69,27 @@ export const POST = async (request: NextRequest) => { const { query, matches, contextLines, whole } = parsed.data; + const tree = parser.parse(query); + const zoektQuery = transformToZoektQuery(tree, query); + + console.log(JSON.stringify(zoektQuery, null, 2)); + const searchRequest: SearchRequest = { - query: { - and: { - // @todo: we should use repo_ids to filter out repositories 
that the user - // has access to (if permission syncing is enabled!). - children: [ - { - regexp: { - regexp: query, - case_sensitive: true, - } - } - ] - } - }, + query: zoektQuery, + // query: { + // and: { + // // @todo: we should use repo_ids to filter out repositories that the user + // // has access to (if permission syncing is enabled!). + // children: [ + // { + // regexp: { + // regexp: query, + // case_sensitive: true, + // } + // } + // ] + // } + // }, opts: { chunk_matches: true, max_match_display_count: matches, diff --git a/packages/web/src/app/api/(server)/stream_search/transformer.ts b/packages/web/src/app/api/(server)/stream_search/transformer.ts new file mode 100644 index 00000000..61dfb0b7 --- /dev/null +++ b/packages/web/src/app/api/(server)/stream_search/transformer.ts @@ -0,0 +1,230 @@ +import { Tree, SyntaxNode } from "@sourcebot/query-language"; +import { Q } from '@/proto/zoekt/webserver/v1/Q'; + +/** + * Transform a Lezer parse tree into a Zoekt gRPC query + */ +export function transformToZoektQuery(tree: Tree, input: string): Q { + return transformNode(tree.topNode, input); +} + +function transformNode(node: SyntaxNode, input: string): Q { + const nodeName = node.type.name; + + switch (nodeName) { + case "Program": { + // Program wraps the actual query - transform its child + const child = node.firstChild; + if (!child) { + // Empty query - match nothing + return { const: false, query: "const" }; + } + return transformNode(child, input); + } + case "AndExpr": + return { + and: { + children: getChildren(node).map(c => transformNode(c, input)) + }, + query: "and" + } + + case "OrExpr": + return { + or: { + children: getChildren(node).map(c => transformNode(c, input)) + }, + query: "or" + }; + + case "NegateExpr": { + // Find the child after the negate token + const negateChild = node.getChild("PrefixExpr") || node.getChild("ParenExpr"); + if (!negateChild) { + throw new Error("NegateExpr missing child"); + } + return { + not: { + child: 
transformNode(negateChild, input) + }, + query: "not" + }; + } + case "ParenExpr": { + // Parentheses just group - transform the inner query + const innerQuery = node.getChild("query") || node.firstChild; + if (!innerQuery) { + return { const: false, query: "const" }; + } + return transformNode(innerQuery, input); + } + case "PrefixExpr": + // PrefixExpr contains specific prefix types + return transformPrefixExpr(node, input); + + case "Term": { + // Plain search term - becomes substring search in content + const termText = input.substring(node.from, node.to); + return { + substring: { + pattern: termText.replace(/^"|"$/g, ''), // Remove quotes if present + case_sensitive: false, + file_name: false, + content: true + }, + query: "substring" + }; + } + default: + console.warn(`Unhandled node type: ${nodeName}`); + return { const: true, query: "const" }; + } +} + +function transformPrefixExpr(node: SyntaxNode, input: string): Q { + // Find which specific prefix type this is + const prefixNode = node.firstChild; + if (!prefixNode) { + throw new Error("PrefixExpr has no child"); + } + + const prefixType = prefixNode.type.name; + + // Extract the full text (e.g., "file:test.js") and split on the colon + const fullText = input.substring(prefixNode.from, prefixNode.to); + const colonIndex = fullText.indexOf(':'); + if (colonIndex === -1) { + throw new Error(`${prefixType} missing colon`); + } + + // Get the value part after the colon and remove quotes if present + const value = fullText.substring(colonIndex + 1).replace(/^"|"$/g, ''); + + switch (prefixType) { + case "FileExpr": + return { + substring: { + pattern: value, + case_sensitive: false, + file_name: true, + content: false + }, + query: "substring" + }; + + case "RepoExpr": + return { + repo: { + regexp: value + }, + query: "repo" + }; + + case "BranchExpr": + return { + branch: { + pattern: value, + exact: false + }, + query: "branch" + }; + + case "ContentExpr": + return { + substring: { + pattern: value, + 
case_sensitive: false, + file_name: false, + content: true + }, + query: "substring" + }; + + case "CaseExpr": { + // case:yes/no wraps the next term with case sensitivity + const caseValue = value.toLowerCase(); + const isCaseSensitive = caseValue === "yes" || caseValue === "true"; + return { + substring: { + pattern: value, + case_sensitive: isCaseSensitive, + file_name: false, + content: true + }, + query: "substring" + }; + } + case "LangExpr": + return { + language: { + language: value + }, + query: "language" + }; + + case "SymExpr": + // Symbol search wraps a pattern + return { + symbol: { + expr: { + substring: { + pattern: value, + case_sensitive: false, + file_name: false, + content: true + }, + query: "substring" + } + }, + query: "symbol" + }; + case "RegexExpr": + return { + regexp: { + regexp: value, + case_sensitive: false, + file_name: false, + content: true + }, + query: "regexp" + }; + + // @todo: handle this + case "ArchivedExpr": + case "ForkExpr": + case "PublicExpr": + // These are repo metadata filters + // They need to be handled via repo filters in Zoekt + // For now, return a const query (you might need custom handling) + console.warn(`${prefixType} not yet implemented`); + return { const: true, query: "const" }; + + case "RepoSetExpr": { + return { + repo_set: { + set: value.split(',').reduce((acc, s) => { + acc[s.trim()] = true; + return acc; + }, {} as Record<string, boolean>) + }, + query: "repo_set" + }; + } + default: + throw new Error(`Unknown prefix type: ${prefixType}`); + } +} + +function getChildren(node: SyntaxNode): SyntaxNode[] { + const children: SyntaxNode[] = []; + let child = node.firstChild; + while (child) { + // Skip certain node types that are just structural + if (!["(", ")", "or"].includes(child.type.name)) { + children.push(child); + } + child = child.nextSibling; + } + return children; +} + diff --git a/yarn.lock b/yarn.lock index 9ca01d8a..9b148710 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8016,7 +8016,7 @@ __metadata:
languageName: unknown linkType: soft -"@sourcebot/query-language@workspace:packages/queryLanguage": +"@sourcebot/query-language@workspace:*, @sourcebot/query-language@workspace:packages/queryLanguage": version: 0.0.0-use.local resolution: "@sourcebot/query-language@workspace:packages/queryLanguage" dependencies: @@ -8152,6 +8152,7 @@ __metadata: "@shopify/lang-jsonc": "npm:^1.0.0" "@sourcebot/codemirror-lang-tcl": "npm:^1.0.12" "@sourcebot/db": "workspace:*" + "@sourcebot/query-language": "workspace:*" "@sourcebot/schemas": "workspace:*" "@sourcebot/shared": "workspace:*" "@ssddanbrown/codemirror-lang-twig": "npm:^1.0.0"