add lezer tree -> grpc transformer

This commit is contained in:
bkellam 2025-11-15 17:17:52 -08:00
parent cfdadf29e0
commit 8dce2aaac3
9 changed files with 465 additions and 19 deletions

View file

@ -18,7 +18,7 @@
"dev:prisma:studio": "yarn with-env yarn workspace @sourcebot/db prisma:studio", "dev:prisma:studio": "yarn with-env yarn workspace @sourcebot/db prisma:studio",
"dev:prisma:migrate:reset": "yarn with-env yarn workspace @sourcebot/db prisma:migrate:reset", "dev:prisma:migrate:reset": "yarn with-env yarn workspace @sourcebot/db prisma:migrate:reset",
"dev:prisma:db:push": "yarn with-env yarn workspace @sourcebot/db prisma:db:push", "dev:prisma:db:push": "yarn with-env yarn workspace @sourcebot/db prisma:db:push",
"build:deps": "yarn workspaces foreach --recursive --topological --from '{@sourcebot/schemas,@sourcebot/db,@sourcebot/shared}' run build" "build:deps": "yarn workspaces foreach --recursive --topological --from '{@sourcebot/schemas,@sourcebot/db,@sourcebot/shared,@sourcebot/query-language}' run build"
}, },
"devDependencies": { "devDependencies": {
"concurrently": "^9.2.1", "concurrently": "^9.2.1",

View file

@ -1,10 +1,10 @@
{ {
"name": "@sourcebot/query-language", "name": "@sourcebot/query-language",
"private": true, "private": true,
"main": "dist/index.js",
"scripts": { "scripts": {
"build": "lezer-generator src/query.grammar -o src/parser --typeScript --names && tsc", "build": "lezer-generator src/query.grammar -o src/parser --typeScript --names && tsc",
"test": "vitest", "test": "vitest"
"asdf": "tsx test.ts"
}, },
"devDependencies": { "devDependencies": {
"@lezer/generator": "^1.8.0", "@lezer/generator": "^1.8.0",

View file

@ -0,0 +1,6 @@
import { parser } from "./parser";
// Parse tree type, derived from the return type of `parser.parse` so it always matches the generated parser.
type Tree = ReturnType<typeof parser.parse>;
// Node type, derived from the type of a tree's `topNode`.
type SyntaxNode = Tree['topNode'];
// Re-exported as types only; these aliases have no runtime value.
export type { Tree, SyntaxNode };
export * from "./parser";

View file

@ -0,0 +1,200 @@
# OR has lowest precedence - implicit AND groups first
a b or c d
==>
Program(OrExpr(AndExpr(Term,Term),AndExpr(Term,Term)))
# Multiple OR operators flatten into a single OrExpr
a or b or c
==>
Program(OrExpr(Term,Term,Term))
# AND before OR
file:test.js error or file:test.go panic
==>
Program(OrExpr(AndExpr(PrefixExpr(FileExpr),Term),AndExpr(PrefixExpr(FileExpr),Term)))
# Negation binds tighter than AND
-file:test.js error
==>
Program(AndExpr(NegateExpr(PrefixExpr(FileExpr)),Term))
# Negation binds tighter than OR
-file:a.js or file:b.js
==>
Program(OrExpr(NegateExpr(PrefixExpr(FileExpr)),PrefixExpr(FileExpr)))
# Parentheses override precedence
(a or b) c
==>
Program(AndExpr(ParenExpr(OrExpr(Term,Term)),Term))
# Parentheses override - OR inside parens groups first
a (b or c)
==>
Program(AndExpr(Term,ParenExpr(OrExpr(Term,Term))))
# Complex: AND, OR, and negation
a -b or c d
==>
Program(OrExpr(AndExpr(Term,Term),AndExpr(Term,Term)))
# Negated group in OR expression
-(a b) or c
==>
Program(OrExpr(NegateExpr(ParenExpr(AndExpr(Term,Term))),Term))
# Multiple negations in OR
-file:a.js or -file:b.js or file:c.js
==>
Program(OrExpr(NegateExpr(PrefixExpr(FileExpr)),NegateExpr(PrefixExpr(FileExpr)),PrefixExpr(FileExpr)))
# Prefix binds to its value only
file:a.js b.js
==>
Program(AndExpr(PrefixExpr(FileExpr),Term))
# OR with prefixes and terms mixed
repo:backend error or repo:frontend warning
==>
Program(OrExpr(AndExpr(PrefixExpr(RepoExpr),Term),AndExpr(PrefixExpr(RepoExpr),Term)))
# Nested parentheses with OR
((a or b) c) or d
==>
Program(OrExpr(ParenExpr(AndExpr(ParenExpr(OrExpr(Term,Term)),Term)),Term))
# OR at different nesting levels
(a or (b or c))
==>
Program(ParenExpr(OrExpr(Term,ParenExpr(OrExpr(Term,Term)))))
# Implicit AND groups all adjacent terms before OR
a b c or d e f
==>
Program(OrExpr(AndExpr(Term,Term,Term),AndExpr(Term,Term,Term)))
# Mixed prefix and regular terms with OR
lang:go func or lang:rust fn
==>
Program(OrExpr(AndExpr(PrefixExpr(LangExpr),Term),AndExpr(PrefixExpr(LangExpr),Term)))
# Negation doesn't affect OR grouping
a or -b or c
==>
Program(OrExpr(Term,Term,Term))
# Parentheses can isolate OR from surrounding AND
a (b or c) d
==>
Program(AndExpr(Term,ParenExpr(OrExpr(Term,Term)),Term))
# Multiple parenthesized groups with AND
(a or b) (c or d)
==>
Program(AndExpr(ParenExpr(OrExpr(Term,Term)),ParenExpr(OrExpr(Term,Term))))
# Quoted strings are atomic - no precedence inside
"a or b"
==>
Program(Term)
# Prefix with OR value doesn't split
file:"a.js or b.js"
==>
Program(PrefixExpr(FileExpr))
# Negated prefix in complex expression
-file:test.js lang:go error or warning
==>
Program(OrExpr(AndExpr(NegateExpr(PrefixExpr(FileExpr)),PrefixExpr(LangExpr),Term),Term))
# OR followed by parenthesized AND
a or (b c)
==>
Program(OrExpr(Term,ParenExpr(AndExpr(Term,Term))))
# Empty parens don't affect precedence
() or a b
==>
Program(OrExpr(ParenExpr(Term(⚠)),AndExpr(Term,Term)))
# Negation of empty group
-() a
==>
Program(AndExpr(NegateExpr(ParenExpr(Term(⚠))),Term))

View file

@ -18,6 +18,6 @@
"isolatedModules": true, "isolatedModules": true,
"resolveJsonModule": true "resolveJsonModule": true
}, },
"include": ["src/parser.ts"], "include": ["src/index.ts"],
"exclude": ["node_modules", "dist"] "exclude": ["node_modules", "dist"]
} }

View file

@ -94,6 +94,7 @@
"@shopify/lang-jsonc": "^1.0.0", "@shopify/lang-jsonc": "^1.0.0",
"@sourcebot/codemirror-lang-tcl": "^1.0.12", "@sourcebot/codemirror-lang-tcl": "^1.0.12",
"@sourcebot/db": "workspace:*", "@sourcebot/db": "workspace:*",
"@sourcebot/query-language": "workspace:*",
"@sourcebot/schemas": "workspace:*", "@sourcebot/schemas": "workspace:*",
"@sourcebot/shared": "workspace:*", "@sourcebot/shared": "workspace:*",
"@ssddanbrown/codemirror-lang-twig": "^1.0.0", "@ssddanbrown/codemirror-lang-twig": "^1.0.0",

View file

@ -16,6 +16,8 @@ import { PrismaClient, Repo } from '@sourcebot/db';
import { createLogger, env } from '@sourcebot/shared'; import { createLogger, env } from '@sourcebot/shared';
import { NextRequest } from 'next/server'; import { NextRequest } from 'next/server';
import * as path from 'path'; import * as path from 'path';
import { parser } from '@sourcebot/query-language';
import { transformToZoektQuery } from './transformer';
const logger = createLogger('streamSearchApi'); const logger = createLogger('streamSearchApi');
@ -67,21 +69,27 @@ export const POST = async (request: NextRequest) => {
const { query, matches, contextLines, whole } = parsed.data; const { query, matches, contextLines, whole } = parsed.data;
const tree = parser.parse(query);
const zoektQuery = transformToZoektQuery(tree, query);
console.log(JSON.stringify(zoektQuery, null, 2));
const searchRequest: SearchRequest = { const searchRequest: SearchRequest = {
query: { query: zoektQuery,
and: { // query: {
// @todo: we should use repo_ids to filter out repositories that the user // and: {
// has access to (if permission syncing is enabled!). // // @todo: we should use repo_ids to filter out repositories that the user
children: [ // // has access to (if permission syncing is enabled!).
{ // children: [
regexp: { // {
regexp: query, // regexp: {
case_sensitive: true, // regexp: query,
} // case_sensitive: true,
} // }
] // }
} // ]
}, // }
// },
opts: { opts: {
chunk_matches: true, chunk_matches: true,
max_match_display_count: matches, max_match_display_count: matches,

View file

@ -0,0 +1,230 @@
import { Tree, SyntaxNode } from "@sourcebot/query-language";
import { Q } from '@/proto/zoekt/webserver/v1/Q';
/**
 * Converts a Lezer parse tree into the equivalent Zoekt gRPC query.
 *
 * @param tree - Parse tree whose `topNode` is the starting point of the conversion.
 * @param input - The query text; node offsets are resolved against this string.
 * @returns The Zoekt query built from the tree's top node.
 */
export function transformToZoektQuery(tree: Tree, input: string): Q {
    const root = tree.topNode;
    return transformNode(root, input);
}
/**
 * Recursively converts one parse-tree node into a Zoekt query.
 *
 * @param node - The node to convert; dispatch is on `node.type.name`.
 * @param input - The full query text; `node.from` / `node.to` index into it.
 * @returns The Zoekt query for `node` and its descendants.
 * @throws Error if a `NegateExpr` has neither a `PrefixExpr` nor a `ParenExpr` child.
 */
function transformNode(node: SyntaxNode, input: string): Q {
    const nodeName = node.type.name;

    switch (nodeName) {
        case "Program": {
            // Program wraps the actual query - transform its child
            const child = node.firstChild;
            if (!child) {
                // Empty query - match nothing
                return { const: false, query: "const" };
            }
            return transformNode(child, input);
        }

        case "AndExpr":
            return {
                and: {
                    children: getChildren(node).map(c => transformNode(c, input))
                },
                query: "and"
            };

        case "OrExpr":
            return {
                or: {
                    children: getChildren(node).map(c => transformNode(c, input))
                },
                query: "or"
            };

        case "NegateExpr": {
            // Only PrefixExpr and ParenExpr children are looked up as the negated operand
            const negateChild = node.getChild("PrefixExpr") || node.getChild("ParenExpr");
            if (!negateChild) {
                throw new Error("NegateExpr missing child");
            }
            return {
                not: {
                    child: transformNode(negateChild, input)
                },
                query: "not"
            };
        }

        case "ParenExpr": {
            // Parentheses just group - transform the inner query
            const innerQuery = node.getChild("query") || node.firstChild;
            if (!innerQuery) {
                return { const: false, query: "const" };
            }
            return transformNode(innerQuery, input);
        }

        case "PrefixExpr":
            // PrefixExpr contains specific prefix types
            return transformPrefixExpr(node, input);

        case "Term": {
            // Plain search term - becomes substring search in content
            const termText = input.substring(node.from, node.to);
            // Only a matched pair of surrounding quotes is quoting syntax; a lone
            // leading or trailing quote is part of the term and must be kept.
            const isQuoted =
                termText.length >= 2 && termText.startsWith('"') && termText.endsWith('"');
            return {
                substring: {
                    pattern: isQuoted ? termText.slice(1, -1) : termText,
                    case_sensitive: false,
                    file_name: false,
                    content: true
                },
                query: "substring"
            };
        }

        default:
            // Unrecognized node types are logged and mapped to a constant-true query
            console.warn(`Unhandled node type: ${nodeName}`);
            return { const: true, query: "const" };
    }
}
/**
 * Converts a `PrefixExpr` node (for example `file:test.js`) into a Zoekt query.
 *
 * The concrete prefix kind is taken from the name of the node's first child, and
 * the value is the text after the first colon with one matched pair of
 * surrounding double quotes removed.
 *
 * @param node - The `PrefixExpr` node; its first child identifies the prefix kind.
 * @param input - The full query text; the child's `from` / `to` index into it.
 * @returns The Zoekt query for the prefix expression.
 * @throws Error if the node has no child, the text has no colon, or the prefix kind is unknown.
 */
function transformPrefixExpr(node: SyntaxNode, input: string): Q {
    // Find which specific prefix type this is
    const prefixNode = node.firstChild;
    if (!prefixNode) {
        throw new Error("PrefixExpr has no child");
    }
    const prefixType = prefixNode.type.name;

    // Extract the full text (e.g., "file:test.js") and split on the first colon
    const fullText = input.substring(prefixNode.from, prefixNode.to);
    const colonIndex = fullText.indexOf(':');
    if (colonIndex === -1) {
        throw new Error(`${prefixType} missing colon`);
    }

    // Only a matched pair of surrounding quotes is quoting syntax; a lone leading
    // or trailing quote belongs to the value and must be kept.
    const rawValue = fullText.substring(colonIndex + 1);
    const isQuoted =
        rawValue.length >= 2 && rawValue.startsWith('"') && rawValue.endsWith('"');
    const value = isQuoted ? rawValue.slice(1, -1) : rawValue;

    switch (prefixType) {
        case "FileExpr":
            return {
                substring: {
                    pattern: value,
                    case_sensitive: false,
                    file_name: true,
                    content: false
                },
                query: "substring"
            };

        case "RepoExpr":
            return {
                repo: {
                    regexp: value
                },
                query: "repo"
            };

        case "BranchExpr":
            return {
                branch: {
                    pattern: value,
                    exact: false
                },
                query: "branch"
            };

        case "ContentExpr":
            return {
                substring: {
                    pattern: value,
                    case_sensitive: false,
                    file_name: false,
                    content: true
                },
                query: "substring"
            };

        case "CaseExpr": {
            // NOTE(review): `value` is the yes/no flag itself, yet it is also used as
            // the search pattern below, so `case:yes` searches content for "yes".
            // This looks unintended - confirm how the flag should apply to other terms.
            const caseValue = value.toLowerCase();
            const isCaseSensitive = caseValue === "yes" || caseValue === "true";
            return {
                substring: {
                    pattern: value,
                    case_sensitive: isCaseSensitive,
                    file_name: false,
                    content: true
                },
                query: "substring"
            };
        }

        case "LangExpr":
            return {
                language: {
                    language: value
                },
                query: "language"
            };

        case "SymExpr":
            // Symbol search wraps a pattern
            return {
                symbol: {
                    expr: {
                        substring: {
                            pattern: value,
                            case_sensitive: false,
                            file_name: false,
                            content: true
                        },
                        query: "substring"
                    }
                },
                query: "symbol"
            };

        case "RegexExpr":
            return {
                regexp: {
                    regexp: value,
                    case_sensitive: false,
                    file_name: false,
                    content: true
                },
                query: "regexp"
            };

        // @todo: handle this
        case "ArchivedExpr":
        case "ForkExpr":
        case "PublicExpr":
            // These are repo metadata filters that are not translated yet; they are
            // logged and mapped to a constant-true query so the rest of the query still runs.
            console.warn(`${prefixType} not yet implemented`);
            return { const: true, query: "const" };

        case "RepoSetExpr": {
            // Drop empty entries (from "a,,b" or a trailing comma) so the set never
            // contains an empty repository name.
            const repoNames = value
                .split(',')
                .map(name => name.trim())
                .filter(name => name.length > 0);
            return {
                repo_set: {
                    // fromEntries defines own properties, so a name such as
                    // "__proto__" is kept as a key instead of being silently dropped.
                    set: Object.fromEntries(
                        repoNames.map((name): [string, boolean] => [name, true])
                    )
                },
                query: "repo_set"
            };
        }

        default:
            throw new Error(`Unknown prefix type: ${prefixType}`);
    }
}
/**
 * Collects the direct children of a node, in sibling order, leaving out the
 * purely structural nodes named "(", ")" and "or".
 *
 * @param node - The parent node whose children are walked via `firstChild` / `nextSibling`.
 * @returns The remaining child nodes; empty when the node has no children.
 */
function getChildren(node: SyntaxNode): SyntaxNode[] {
    const kept: SyntaxNode[] = [];
    for (let cursor = node.firstChild; cursor; cursor = cursor.nextSibling) {
        // Structural tokens carry no query meaning of their own
        const isStructural = ["(", ")", "or"].includes(cursor.type.name);
        if (isStructural) {
            continue;
        }
        kept.push(cursor);
    }
    return kept;
}

View file

@ -8016,7 +8016,7 @@ __metadata:
languageName: unknown languageName: unknown
linkType: soft linkType: soft
"@sourcebot/query-language@workspace:packages/queryLanguage": "@sourcebot/query-language@workspace:*, @sourcebot/query-language@workspace:packages/queryLanguage":
version: 0.0.0-use.local version: 0.0.0-use.local
resolution: "@sourcebot/query-language@workspace:packages/queryLanguage" resolution: "@sourcebot/query-language@workspace:packages/queryLanguage"
dependencies: dependencies:
@ -8152,6 +8152,7 @@ __metadata:
"@shopify/lang-jsonc": "npm:^1.0.0" "@shopify/lang-jsonc": "npm:^1.0.0"
"@sourcebot/codemirror-lang-tcl": "npm:^1.0.12" "@sourcebot/codemirror-lang-tcl": "npm:^1.0.12"
"@sourcebot/db": "workspace:*" "@sourcebot/db": "workspace:*"
"@sourcebot/query-language": "workspace:*"
"@sourcebot/schemas": "workspace:*" "@sourcebot/schemas": "workspace:*"
"@sourcebot/shared": "workspace:*" "@sourcebot/shared": "workspace:*"
"@ssddanbrown/codemirror-lang-twig": "npm:^1.0.0" "@ssddanbrown/codemirror-lang-twig": "npm:^1.0.0"