further wip on query language

This commit is contained in:
bkellam 2025-11-16 16:12:51 -08:00
parent ac9d05a262
commit b966b63976
11 changed files with 369 additions and 400 deletions

View file

@ -4,3 +4,4 @@ type Tree = ReturnType<typeof parser.parse>;
type SyntaxNode = Tree['topNode'];
export type { Tree, SyntaxNode };
export * from "./parser";
export * from "./parser.terms";

View file

@ -1,10 +1,20 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
export const
negate = 24,
negate = 21,
Program = 1,
OrExpr = 2,
AndExpr = 3,
NegateExpr = 4,
PrefixExpr = 5,
ParenExpr = 19,
Term = 20
ArchivedExpr = 6,
RevisionExpr = 7,
ContentExpr = 8,
FileExpr = 9,
ForkExpr = 10,
VisibilityExpr = 11,
RepoExpr = 12,
LangExpr = 13,
SymExpr = 14,
RepoSetExpr = 15,
ParenExpr = 16,
Term = 17

File diff suppressed because one or more lines are too long

View file

@ -32,21 +32,31 @@ NegateExpr { !negate negate (PrefixExpr | ParenExpr) }
ParenExpr { "(" query ")" }
PrefixExpr {
ArchivedExpr { archivedKw value } |
BranchExpr { branchKw value } |
ContentExpr { contentKw value } |
CaseExpr { caseKw value } |
FileExpr { fileKw value } |
ForkExpr { forkKw value } |
PublicExpr { publicKw value } |
RepoExpr { repoKw value } |
RegexExpr { regexKw value } |
LangExpr { langKw value } |
SymExpr { symKw value } |
TypeExpr { typeKw value } |
RepoSetExpr { reposetKw value }
ArchivedExpr |
RevisionExpr |
ContentExpr |
FileExpr |
ForkExpr |
VisibilityExpr |
RepoExpr |
LangExpr |
SymExpr |
RepoSetExpr
}
RevisionExpr { revisionKw value }
ContentExpr { contentKw value }
FileExpr { fileKw value }
RepoExpr { repoKw value }
LangExpr { langKw value }
SymExpr { symKw value }
RepoSetExpr { reposetKw value }
// Modifiers
ArchivedExpr { archivedKw value }
ForkExpr { forkKw value }
VisibilityExpr { visibilityKw value }
Term { quotedString | word }
@ -56,17 +66,14 @@ value { quotedString | word }
@tokens {
archivedKw { "archived:" }
branchKw { "branch:" | "b:" }
revisionKw { "rev:" }
contentKw { "content:" | "c:" }
caseKw { "case:" }
fileKw { "file:" | "f:" }
forkKw { "fork:" }
publicKw { "public:" }
visibilityKw { "visibility:" }
repoKw { "repo:" | "r:" }
regexKw { "regex:" }
langKw { "lang:" }
symKw { "sym:" }
typeKw { "type:" | "t:" }
reposetKw { "reposet:" }
or { "or" ![a-zA-Z0-9_] }
@ -81,9 +88,9 @@ value { quotedString | word }
@precedence {
quotedString,
archivedKw, branchKw, contentKw, caseKw, fileKw,
forkKw, publicKw, repoKw, regexKw, langKw,
symKw, typeKw, reposetKw, or,
archivedKw, revisionKw, contentKw, fileKw,
forkKw, visibilityKw, repoKw, langKw,
symKw, reposetKw, or,
word
}
}

View file

@ -3,7 +3,7 @@ import { negate } from "./parser.terms";
// External tokenizer for negation
// Only tokenizes `-` as negate when followed by a prefix keyword or `(`
export const negateToken = new ExternalTokenizer((input, stack) => {
export const negateToken = new ExternalTokenizer((input) => {
if (input.next !== 45 /* '-' */) return; // Not a dash
const startPos = input.pos;
@ -25,24 +25,22 @@ export const negateToken = new ExternalTokenizer((input, stack) => {
}
// Check if followed by a prefix keyword (by checking for keyword followed by colon)
// We need to look ahead to find the colon
// Look ahead until we hit a delimiter or colon
const checkPos = input.pos;
let foundColon = false;
let charCount = 0;
// Look ahead up to 10 characters to find a colon
while (charCount < 10 && ch >= 0) {
// Look ahead until we hit a delimiter or colon
while (ch >= 0) {
if (ch === 58 /* ':' */) {
foundColon = true;
break;
}
// Hit a delimiter (whitespace, paren, or quote) - not a prefix keyword
if (ch === 32 || ch === 9 || ch === 10 || ch === 40 || ch === 41 || ch === 34) {
// Hit whitespace, paren, or quote - not a prefix
break;
}
input.advance();
ch = input.next;
charCount++;
}
// Reset position

View file

@ -62,21 +62,13 @@ Program(NegateExpr(PrefixExpr(LangExpr)))
Program(NegateExpr(PrefixExpr(ContentExpr)))
# Negate branch prefix
# Negate revision prefix
-branch:develop
-rev:develop
==>
Program(NegateExpr(PrefixExpr(BranchExpr)))
# Negate case prefix
-case:yes
==>
Program(NegateExpr(PrefixExpr(CaseExpr)))
Program(NegateExpr(PrefixExpr(RevisionExpr)))
# Negate archived prefix
@ -94,13 +86,13 @@ Program(NegateExpr(PrefixExpr(ArchivedExpr)))
Program(NegateExpr(PrefixExpr(ForkExpr)))
# Negate public prefix
# Negate visibility prefix
-public:no
-visibility:any
==>
Program(NegateExpr(PrefixExpr(PublicExpr)))
Program(NegateExpr(PrefixExpr(VisibilityExpr)))
# Negate symbol prefix
@ -110,22 +102,6 @@ Program(NegateExpr(PrefixExpr(PublicExpr)))
Program(NegateExpr(PrefixExpr(SymExpr)))
# Negate type prefix
-type:repo
==>
Program(NegateExpr(PrefixExpr(TypeExpr)))
# Negate regex prefix
-regex:test.*
==>
Program(NegateExpr(PrefixExpr(RegexExpr)))
# Negate parentheses
-(test)
@ -222,14 +198,6 @@ Program(NegateExpr(PrefixExpr(FileExpr)))
Program(NegateExpr(PrefixExpr(RepoExpr)))
# Negate short form branch
-b:main
==>
Program(NegateExpr(PrefixExpr(BranchExpr)))
# Negate short form content
-c:console
@ -238,14 +206,6 @@ Program(NegateExpr(PrefixExpr(BranchExpr)))
Program(NegateExpr(PrefixExpr(ContentExpr)))
# Negate short form type
-t:file
==>
Program(NegateExpr(PrefixExpr(TypeExpr)))
# Negate with prefix in quotes
-file:"test file.js"

View file

@ -110,13 +110,13 @@ repo:project1 or repo:project2
Program(OrExpr(PrefixExpr(RepoExpr),PrefixExpr(RepoExpr)))
# OR with branch prefixes
# OR with revision prefixes
branch:main or branch:develop
rev:main or rev:develop
==>
Program(OrExpr(PrefixExpr(BranchExpr),PrefixExpr(BranchExpr)))
Program(OrExpr(PrefixExpr(RevisionExpr),PrefixExpr(RevisionExpr)))
# OR with lang prefixes

View file

@ -46,21 +46,13 @@ c:console.log
Program(PrefixExpr(ContentExpr))
# Branch prefix
# Revision prefix
branch:main
rev:main
==>
Program(PrefixExpr(BranchExpr))
# Branch prefix short form
b:develop
==>
Program(PrefixExpr(BranchExpr))
Program(PrefixExpr(RevisionExpr))
# Lang prefix
@ -70,14 +62,6 @@ lang:typescript
Program(PrefixExpr(LangExpr))
# Case prefix
case:yes
==>
Program(PrefixExpr(CaseExpr))
# Archived prefix
archived:no
@ -94,13 +78,13 @@ fork:yes
Program(PrefixExpr(ForkExpr))
# Public prefix
# Visibility prefix - public
public:yes
visibility:public
==>
Program(PrefixExpr(PublicExpr))
Program(PrefixExpr(VisibilityExpr))
# Symbol prefix
@ -110,30 +94,6 @@ sym:MyClass
Program(PrefixExpr(SymExpr))
# Type prefix
type:file
==>
Program(PrefixExpr(TypeExpr))
# Type prefix short form
t:repo
==>
Program(PrefixExpr(TypeExpr))
# Regex prefix
regex:test.*
==>
Program(PrefixExpr(RegexExpr))
# RepoSet prefix
reposet:repo1,repo2
@ -214,21 +174,13 @@ content:hello
Program(PrefixExpr(ContentExpr))
# Branch with slashes
# Revision with slashes
branch:feature/new-feature
rev:feature/new-feature
==>
Program(PrefixExpr(BranchExpr))
# Case values
case:auto
==>
Program(PrefixExpr(CaseExpr))
Program(PrefixExpr(RevisionExpr))
# RepoSet with multiple repos
@ -246,14 +198,6 @@ sym:package.Class.method
Program(PrefixExpr(SymExpr))
# Type variations
type:filename
==>
Program(PrefixExpr(TypeExpr))
# Lang with various languages
lang:python
@ -278,21 +222,13 @@ fork:no
Program(PrefixExpr(ForkExpr))
# Public values
# Visibility prefix - private
public:no
visibility:private
==>
Program(PrefixExpr(PublicExpr))
# Regex with complex pattern
regex:\w+\s*=\s*\d+
==>
Program(PrefixExpr(RegexExpr))
Program(PrefixExpr(VisibilityExpr))
# File with dashes

View file

@ -102,13 +102,13 @@ content:"console.log"
Program(PrefixExpr(ContentExpr))
# Quoted string in branch prefix
# Quoted string in revision prefix
branch:"feature/my feature"
rev:"feature/my feature"
==>
Program(PrefixExpr(BranchExpr))
Program(PrefixExpr(RevisionExpr))
# Multiple quoted strings
@ -286,22 +286,6 @@ content:"TODO: fix this"
Program(PrefixExpr(ContentExpr))
# Regex prefix with quoted pattern
regex:"func\\s+\\w+"
==>
Program(PrefixExpr(RegexExpr))
# Case prefix with quoted value
case:"yes"
==>
Program(PrefixExpr(CaseExpr))
# Quoted string with at symbol
"@decorator"
@ -486,9 +470,9 @@ Program(AndExpr(Term,PrefixExpr(FileExpr)))
Program(Term)
# Quoted branch prefix
# Quoted revision prefix
"branch:main"
"rev:main"
==>

View file

@ -16,7 +16,7 @@ import { PrismaClient, Repo } from '@sourcebot/db';
import { createLogger, env } from '@sourcebot/shared';
import { NextRequest } from 'next/server';
import * as path from 'path';
import { parser } from '@sourcebot/query-language';
import { parser as _parser } from '@sourcebot/query-language';
import { transformToZoektQuery } from './transformer';
const logger = createLogger('streamSearchApi');
@ -69,13 +69,38 @@ export const POST = async (request: NextRequest) => {
const { query, matches, contextLines, whole } = parsed.data;
const isCaseSensitivityEnabled = false;
const isRegexEnabled = false;
const parser = _parser.configure({
strict: true,
})
const tree = parser.parse(query);
const zoektQuery = transformToZoektQuery(tree, query);
const zoektQuery = transformToZoektQuery({
tree,
input: query,
isCaseSensitivityEnabled,
isRegexEnabled,
});
console.log(JSON.stringify(zoektQuery, null, 2));
const searchRequest: SearchRequest = {
query: zoektQuery,
query: {
and: {
children: [
zoektQuery,
// {
// raw_config: {
// flags: [
// 'FLAG_NO_FORKS',
// ]
// }
// }
]
}
},
// query: {
// and: {
// // @todo: we should use repo_ids to filter out repositories that the user

View file

@ -1,221 +1,269 @@
import { Tree, SyntaxNode } from "@sourcebot/query-language";
import { Q } from '@/proto/zoekt/webserver/v1/Q';
import { Q } from '@/proto/zoekt/webserver/v1/Q';
import {
Program,
AndExpr,
OrExpr,
NegateExpr,
ParenExpr,
PrefixExpr,
Term,
FileExpr,
RepoExpr,
RevisionExpr,
ContentExpr,
LangExpr,
SymExpr,
ArchivedExpr,
ForkExpr,
VisibilityExpr,
RepoSetExpr
} from '@sourcebot/query-language';
/**
* Transform a Lezer parse tree into a Zoekt gRPC query
*/
export function transformToZoektQuery(tree: Tree, input: string): Q {
return transformNode(tree.topNode, input);
/**
 * Transform a Lezer parse tree of the query language into a Zoekt gRPC query.
 *
 * @param tree - Parse tree whose `topNode` is the root of the query.
 * @param input - The raw query text. Node offsets (`from` / `to`) are used to
 *   slice term and prefix values out of this string.
 * @param isCaseSensitivityEnabled - Copied into `case_sensitive` on every
 *   substring / regexp query that is produced.
 * @param isRegexEnabled - When true, bare terms become `regexp` queries
 *   instead of `substring` queries. Prefix expressions are not affected.
 * @returns The Zoekt query for the tree's top node.
 * @throws Error when a `NegateExpr` has no `PrefixExpr` / `ParenExpr` child,
 *   when a `PrefixExpr` has no child, when a prefix's text contains no colon,
 *   or when the prefix type is not one of the handled ones.
 */
export const transformToZoektQuery = ({
    tree,
    input,
    isCaseSensitivityEnabled,
    isRegexEnabled,
}: {
    tree: Tree;
    input: string;
    isCaseSensitivityEnabled: boolean;
    isRegexEnabled: boolean;
}): Q => {
    // Recursively converts a single syntax node into its Zoekt query.
    const transformNode = (node: SyntaxNode): Q => {
        switch (node.type.id) {
            case Program: {
                // Program wraps the actual query - transform its child
                const child = node.firstChild;
                if (!child) {
                    // Empty query - match nothing
                    return { const: false, query: "const" };
                }
                return transformNode(child);
            }

            case AndExpr:
                return {
                    and: {
                        children: getChildren(node).map(c => transformNode(c))
                    },
                    query: "and"
                };

            case OrExpr:
                return {
                    or: {
                        children: getChildren(node).map(c => transformNode(c))
                    },
                    query: "or"
                };

            case NegateExpr: {
                // Find the child after the negate token
                const negateChild = node.getChild("PrefixExpr") || node.getChild("ParenExpr");
                if (!negateChild) {
                    throw new Error("NegateExpr missing child");
                }
                return {
                    not: {
                        child: transformNode(negateChild)
                    },
                    query: "not"
                };
            }

            case ParenExpr: {
                // Parentheses just group - transform the inner query
                const innerQuery = node.getChild("query") || node.firstChild;
                if (!innerQuery) {
                    return { const: false, query: "const" };
                }
                return transformNode(innerQuery);
            }

            case PrefixExpr:
                // PrefixExpr contains specific prefix types
                return transformPrefixExpr(node);

            case Term: {
                // Plain search term: matched against file content. A leading
                // and/or trailing double quote is removed from the raw text.
                const termText = input.substring(node.from, node.to).replace(/^"|"$/g, '');

                return isRegexEnabled ? {
                    regexp: {
                        regexp: termText,
                        case_sensitive: isCaseSensitivityEnabled,
                        file_name: false,
                        content: true
                    },
                    query: "regexp"
                } : {
                    substring: {
                        pattern: termText,
                        case_sensitive: isCaseSensitivityEnabled,
                        file_name: false,
                        content: true
                    },
                    query: "substring"
                };
            }

            default:
                // Unknown node kinds are logged and treated as "match everything".
                console.warn(`Unhandled node type: ${node.type.name} (id: ${node.type.id})`);
                return { const: true, query: "const" };
        }
    };

    // Converts a PrefixExpr (e.g. `file:test.js`) into its Zoekt query.
    const transformPrefixExpr = (node: SyntaxNode): Q => {
        // Find which specific prefix type this is
        const prefixNode = node.firstChild;
        if (!prefixNode) {
            throw new Error("PrefixExpr has no child");
        }

        const prefixTypeId = prefixNode.type.id;

        // Extract the full text (e.g., "file:test.js") and split on the colon
        const fullText = input.substring(prefixNode.from, prefixNode.to);
        const colonIndex = fullText.indexOf(':');
        if (colonIndex === -1) {
            throw new Error(`${prefixNode.type.name} missing colon`);
        }

        // Get the value part after the colon and remove quotes if present
        const value = fullText.substring(colonIndex + 1).replace(/^"|"$/g, '');

        switch (prefixTypeId) {
            case FileExpr:
                return {
                    substring: {
                        pattern: value,
                        case_sensitive: isCaseSensitivityEnabled,
                        file_name: true,
                        content: false
                    },
                    query: "substring"
                };

            case RepoExpr:
                return {
                    repo: {
                        regexp: value
                    },
                    query: "repo"
                };

            case RevisionExpr:
                // `rev:` is expressed as a (non-exact) branch query.
                return {
                    branch: {
                        pattern: value,
                        exact: false
                    },
                    query: "branch"
                };

            case ContentExpr:
                return {
                    substring: {
                        pattern: value,
                        case_sensitive: isCaseSensitivityEnabled,
                        file_name: false,
                        content: true
                    },
                    query: "substring"
                };

            case LangExpr:
                return {
                    language: {
                        language: value
                    },
                    query: "language"
                };

            case SymExpr:
                // Symbol search wraps a pattern
                return {
                    symbol: {
                        expr: {
                            substring: {
                                pattern: value,
                                case_sensitive: isCaseSensitivityEnabled,
                                file_name: false,
                                content: true
                            },
                            query: "substring"
                        }
                    },
                    query: "symbol"
                };

            case VisibilityExpr: {
                const visibilityValue = value.toLowerCase();
                const flags: ('FLAG_ONLY_PUBLIC' | 'FLAG_ONLY_PRIVATE')[] = [];

                if (visibilityValue === 'public') {
                    flags.push('FLAG_ONLY_PUBLIC');
                } else if (visibilityValue === 'private') {
                    flags.push('FLAG_ONLY_PRIVATE');
                }
                // Any other value (e.g. 'any') adds no flag, i.e. no filter

                return {
                    raw_config: {
                        flags
                    },
                    query: "raw_config"
                };
            }

            case ArchivedExpr: {
                const archivedValue = value.toLowerCase();
                const flags: ('FLAG_ONLY_ARCHIVED' | 'FLAG_NO_ARCHIVED')[] = [];

                if (archivedValue === 'yes') {
                    // 'yes' means include archived repositories (default)
                } else if (archivedValue === 'no') {
                    flags.push('FLAG_NO_ARCHIVED');
                } else if (archivedValue === 'only') {
                    flags.push('FLAG_ONLY_ARCHIVED');
                }
                // Unrecognised values add no flag.

                return {
                    raw_config: {
                        flags
                    },
                    query: "raw_config"
                };
            }

            // @todo: handle this
            case ForkExpr:
                // These are repo metadata filters
                // They need to be handled via repo filters in Zoekt
                // For now, return a const query (you might need custom handling)
                console.warn(`${prefixNode.type.name} not yet implemented`);
                return { const: true, query: "const" };

            case RepoSetExpr: {
                // Comma-separated repository names. Blank entries (from
                // `reposet:a,,b`, a trailing comma, or an empty value) are
                // skipped so the set never contains an empty repository name.
                const set: Record<string, boolean> = {};
                for (const rawName of value.split(',')) {
                    const name = rawName.trim();
                    if (name.length > 0) {
                        set[name] = true;
                    }
                }

                return {
                    repo_set: {
                        set
                    },
                    query: "repo_set"
                };
            }

            default:
                throw new Error(`Unknown prefix type: ${prefixNode.type.name} (id: ${prefixTypeId})`);
        }
    };

    return transformNode(tree.topNode);
};
/**
 * Convert one syntax node (and, recursively, its children) into a Zoekt query.
 *
 * @param node - The syntax node to convert; dispatched on `node.type.name`.
 * @param input - The raw query text that `node.from` / `node.to` index into.
 * @returns The Zoekt query for `node`. Node kinds that are not recognised are
 *   logged with `console.warn` and mapped to a constant-true query.
 * @throws Error when a `NegateExpr` has neither a `PrefixExpr` nor a
 *   `ParenExpr` child.
 */
function transformNode(node: SyntaxNode, input: string): Q {
    const kind = node.type.name;
    // Shorthand for recursing with the same input text.
    const recurse = (n: SyntaxNode): Q => transformNode(n, input);

    if (kind === "Program") {
        // The program node only wraps the real query; an empty program matches nothing.
        const root = node.firstChild;
        return root ? recurse(root) : { const: false, query: "const" };
    }

    if (kind === "AndExpr") {
        const operands = getChildren(node).map(c => recurse(c));
        return { and: { children: operands }, query: "and" };
    }

    if (kind === "OrExpr") {
        const alternatives = getChildren(node).map(c => recurse(c));
        return { or: { children: alternatives }, query: "or" };
    }

    if (kind === "NegateExpr") {
        // The negated operand is either a prefix expression or a parenthesised group.
        const operand = node.getChild("PrefixExpr") ?? node.getChild("ParenExpr");
        if (operand === null) {
            throw new Error("NegateExpr missing child");
        }
        return { not: { child: recurse(operand) }, query: "not" };
    }

    if (kind === "ParenExpr") {
        // Parentheses only group: unwrap to the inner query, or match nothing if empty.
        const inner = node.getChild("query") ?? node.firstChild;
        return inner ? recurse(inner) : { const: false, query: "const" };
    }

    if (kind === "PrefixExpr") {
        // The concrete prefix kind lives one level down.
        return transformPrefixExpr(node, input);
    }

    if (kind === "Term") {
        // A bare term is a case-insensitive content substring search; a
        // leading and/or trailing double quote is dropped from the raw text.
        const raw = input.substring(node.from, node.to);
        const pattern = raw.replace(/^"|"$/g, '');
        return {
            substring: {
                pattern,
                case_sensitive: false,
                file_name: false,
                content: true
            },
            query: "substring"
        };
    }

    console.warn(`Unhandled node type: ${kind}`);
    return { const: true, query: "const" };
}
/**
 * Convert a `PrefixExpr` node (e.g. `file:test.js`) into a Zoekt query.
 *
 * The concrete prefix kind is the first child of `node`. Its source text is
 * split at the first colon, and a leading and/or trailing double quote is
 * removed from the value part.
 *
 * @param node - The `PrefixExpr` syntax node.
 * @param input - The raw query text that node offsets index into.
 * @returns The Zoekt query for the prefix. Modifier prefixes that are not
 *   implemented yet (`archived:`, `fork:`, `public:`, `case:`) are logged and
 *   mapped to a constant-true query so they do not affect results.
 * @throws Error when `node` has no child, when the prefix text contains no
 *   colon, or when the prefix type is unknown.
 */
function transformPrefixExpr(node: SyntaxNode, input: string): Q {
    // Find which specific prefix type this is
    const prefixNode = node.firstChild;
    if (!prefixNode) {
        throw new Error("PrefixExpr has no child");
    }

    const prefixType = prefixNode.type.name;

    // Extract the full text (e.g., "file:test.js") and split on the colon
    const fullText = input.substring(prefixNode.from, prefixNode.to);
    const colonIndex = fullText.indexOf(':');
    if (colonIndex === -1) {
        throw new Error(`${prefixType} missing colon`);
    }

    // Get the value part after the colon and remove quotes if present
    const value = fullText.substring(colonIndex + 1).replace(/^"|"$/g, '');

    switch (prefixType) {
        case "FileExpr":
            return {
                substring: {
                    pattern: value,
                    case_sensitive: false,
                    file_name: true,
                    content: false
                },
                query: "substring"
            };

        case "RepoExpr":
            return {
                repo: {
                    regexp: value
                },
                query: "repo"
            };

        case "BranchExpr":
            return {
                branch: {
                    pattern: value,
                    exact: false
                },
                query: "branch"
            };

        case "ContentExpr":
            return {
                substring: {
                    pattern: value,
                    case_sensitive: false,
                    file_name: false,
                    content: true
                },
                query: "substring"
            };

        case "LangExpr":
            return {
                language: {
                    language: value
                },
                query: "language"
            };

        case "SymExpr":
            // Symbol search wraps a pattern
            return {
                symbol: {
                    expr: {
                        substring: {
                            pattern: value,
                            case_sensitive: false,
                            file_name: false,
                            content: true
                        },
                        query: "substring"
                    }
                },
                query: "symbol"
            };

        case "RegexExpr":
            return {
                regexp: {
                    regexp: value,
                    case_sensitive: false,
                    file_name: false,
                    content: true
                },
                query: "regexp"
            };

        // @todo: handle this
        case "ArchivedExpr":
        case "ForkExpr":
        case "PublicExpr":
        // `case:` is a modifier whose value (yes/no/auto) describes how OTHER
        // terms should be matched. It must never be used as a search pattern
        // itself - doing so would restrict results to files containing the
        // literal text "yes". This function only sees the prefix node, so the
        // modifier cannot be applied to sibling terms here; treat it as a
        // no-op like the other unimplemented modifiers.
        case "CaseExpr":
            // These are repo metadata filters / query modifiers
            // They need to be handled via repo filters in Zoekt
            // For now, return a const query (you might need custom handling)
            console.warn(`${prefixType} not yet implemented`);
            return { const: true, query: "const" };

        case "RepoSetExpr": {
            return {
                repo_set: {
                    set: value.split(',').reduce((acc, s) => {
                        acc[s.trim()] = true;
                        return acc;
                    }, {} as Record<string, boolean>)
                },
                query: "repo_set"
            };
        }

        default:
            throw new Error(`Unknown prefix type: ${prefixType}`);
    }
}
function getChildren(node: SyntaxNode): SyntaxNode[] {
const getChildren = (node: SyntaxNode): SyntaxNode[] => {
const children: SyntaxNode[] = [];
let child = node.firstChild;
while (child) {