pr-agent/pr_agent/static_analysis/src/file_summary.py
2024-10-02 08:56:33 +03:00

153 lines
5 KiB
Python

import os
from pathlib import Path
from grep_ast import TreeContext
from grep_ast.parsers import PARSERS
# from pygments.lexers import guess_lexer_for_filename
# from pygments.token import Token
from tree_sitter_languages import get_language, get_parser
class FileSummary:
    """
    Summarize the content of a file using tree-sitter queries.

    The file is parsed with tree-sitter, the language's "tags" query is run
    over the tree, and the lines holding definitions are rendered through
    grep_ast's ``TreeContext``.

    Supported languages: C, C++, C#, elisp, elixir, go, java, javascript, ocaml, php, python, ql, ruby, rust, typescript
    """

    def __init__(self, fname_full_path: str, project_base_path, parent_context=True, child_context=False, header_max=0):
        """
        Read the file to summarize and store the rendering options.

        Args:
            fname_full_path: Path of the file to summarize.
            project_base_path: Base directory; the file is reported relative to it.
            parent_context: Forwarded to ``TreeContext`` as ``parent_context``.
            child_context: Forwarded to ``TreeContext`` as ``child_context``.
            header_max: Forwarded to ``TreeContext`` as ``header_max``.

        Raises:
            FileNotFoundError: If ``fname_full_path`` does not exist (a message
                is printed first).
        """
        self.fname_full_path = fname_full_path
        self.project_base_path = project_base_path
        self.fname_rel = os.path.relpath(fname_full_path, project_base_path)
        # The query files live in a 'queries' directory two levels above this module.
        self.main_queries_path = Path(__file__).parent.parent / 'queries'
        self.parent_context = parent_context
        self.child_context = child_context
        self.header_max = header_max

        if not os.path.exists(fname_full_path):
            # Only a diagnostic: the open() below still raises FileNotFoundError.
            print(f"File {fname_full_path} does not exist")
        # The text is re-encoded as UTF-8 before parsing, so decode it as UTF-8
        # instead of relying on the platform's default encoding.
        with open(fname_full_path, "r", encoding="utf-8") as f:
            code = f.read()
        if not code.endswith("\n"):
            code += "\n"
        self.code = code

    def summarize(self):
        """Run the tag queries on the file and return the rendered summary string."""
        query_results = self.get_query_results()
        return self.query_processing(query_results)

    def render_tree_sitter(self, lines_of_interest: list):
        """
        Render the given lines of the file through ``TreeContext``.

        Args:
            lines_of_interest: Line indices to mark as lines of interest.

        Returns:
            The string produced by ``TreeContext.format()``.
        """
        context = TreeContext(
            self.fname_rel,
            self.code,
            color=False,
            line_number=True,  # number the lines (1-indexed)
            parent_context=self.parent_context,
            child_context=self.child_context,
            last_line=False,
            margin=0,
            mark_lois=False,
            loi_pad=0,
            header_max=self.header_max,  # max number of lines to show in a function header
            show_top_of_file_parent_scope=False,
        )
        # Start from an empty selection so only the requested lines are used.
        context.lines_of_interest = set()
        context.add_lines_of_interest(lines_of_interest)
        context.add_context()
        return context.format()

    def query_processing(self, query_results: list):
        """
        Turn query results into the summary text.

        Args:
            query_results: Result dicts as built by ``get_query_results``
                (keys ``fname``, ``name``, ``kind``, ``line``); may be empty
                or ``None``.

        Returns:
            An empty string when there are no results; otherwise a newline,
            the file name followed by ``":"``, and the rendering of every
            line that holds a definition.
        """
        if not query_results:
            return ""

        # Only definitions are rendered; references are ignored here.
        def_lines = [q['line'] for q in query_results if q['kind'] == "def"]
        header = "\n" + query_results[0]['fname'] + ":\n"
        return header + self.render_tree_sitter(def_lines)

    def get_queries_scheme(self, lang) -> str:
        """
        Load the tags query text for ``lang``.

        Returns:
            The content of ``tree-sitter-{lang}-tags.scm`` from the queries
            directory, or an empty string when no such file exists.
        """
        path = os.path.join(self.main_queries_path, f"tree-sitter-{lang}-tags.scm")
        try:
            with open(path, "r") as f:
                return f.read()
        except FileNotFoundError:
            # A language without a query file is expected; signal it with "".
            return ""

    def filename_to_lang(self, filename):
        """Return the ``PARSERS`` entry for the file's extension, or ``None`` if there is none."""
        file_extension = os.path.splitext(filename)[1]
        return PARSERS.get(file_extension)

    def get_query_results(self):
        """
        Parse the file and collect the names captured by the tags query.

        Returns:
            A list of dicts with keys ``fname``, ``name``, ``kind`` (``"def"``
            or ``"ref"``) and ``line`` (row of the captured node's start point;
            presumably 0-based, as tree-sitter reports it). Returns ``None``
            when the extension maps to no language or the parser cannot be
            loaded, and an empty list when the language has no query file.
        """
        fname_rel = self.fname_rel
        lang = self.filename_to_lang(fname_rel)
        if not lang:
            return None

        try:
            language = get_language(lang)
            parser = get_parser(lang)
        except Exception as err:
            # Best effort: a file whose grammar cannot be loaded is skipped.
            print(f"Skipping file {fname_rel}: {err}")
            return None

        query_scheme_str = self.get_queries_scheme(lang)
        if not query_scheme_str:
            # No query text means nothing can be captured.
            return []

        tree = parser.parse(bytes(self.code, "utf-8"))

        # Run the queries
        query = language.query(query_scheme_str)
        captures = list(query.captures(tree.root_node))

        # Parse the results into a list of "def" and "ref" tags
        results = []
        for node, tag in captures:
            if tag.startswith("name.definition."):
                kind = "def"
            elif tag.startswith("name.reference."):
                kind = "ref"
            else:
                continue
            results.append(
                dict(
                    fname=fname_rel,
                    name=node.text.decode("utf-8"),
                    kind=kind,
                    line=node.start_point[0],
                )
            )

        # Currently only defs are consumed downstream, so no refs are
        # back-filled for languages whose queries provide defs only.
        return results