diff --git a/.gitignore b/.gitignore index 8da05107..af44f249 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ build/ docs/.cache/ .qodo poetry.lock +.pr_agent.toml +.claude/skills/SETUP_COMPLETE.md diff --git a/pr_agent/git_providers/__init__.py b/pr_agent/git_providers/__init__.py index 055cdbf1..668e8096 100644 --- a/pr_agent/git_providers/__init__.py +++ b/pr_agent/git_providers/__init__.py @@ -1,29 +1,12 @@ from starlette_context import context from pr_agent.config_loader import get_settings -from pr_agent.git_providers.azuredevops_provider import AzureDevopsProvider -from pr_agent.git_providers.bitbucket_provider import BitbucketProvider -from pr_agent.git_providers.bitbucket_server_provider import \ - BitbucketServerProvider -from pr_agent.git_providers.codecommit_provider import CodeCommitProvider -from pr_agent.git_providers.gerrit_provider import GerritProvider from pr_agent.git_providers.git_provider import GitProvider -from pr_agent.git_providers.gitea_provider import GiteaProvider from pr_agent.git_providers.github_provider import GithubProvider -from pr_agent.git_providers.gitlab_provider import GitLabProvider -from pr_agent.git_providers.local_git_provider import LocalGitProvider -from pr_agent.git_providers.gitea_provider import GiteaProvider +# Only GitHub provider - other providers removed _GIT_PROVIDERS = { 'github': GithubProvider, - 'gitlab': GitLabProvider, - 'bitbucket': BitbucketProvider, - 'bitbucket_server': BitbucketServerProvider, - 'azure': AzureDevopsProvider, - 'codecommit': CodeCommitProvider, - 'local': LocalGitProvider, - 'gerrit': GerritProvider, - 'gitea': GiteaProvider } diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index 5e21b4f8..c746f9e3 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -343,9 +343,9 @@ service_callback = [] [pr_similar_issue] skip_comments = false -force_update_dataset = false max_issues_to_scan = 500 -vectordb = "pinecone" # options: "pinecone", "lancedb", "qdrant" +number_of_similar_issues = 5 # Number of similar issues to return +min_similarity_score = 60 # Minimum fuzzy match score (0-100) to consider an issue similar [pr_find_similar_component] class_name = "" @@ -355,18 +355,8 @@ allow_fallback_less_words = true number_of_keywords = 5 number_of_results = 5 -[pinecone] -# fill and place in .secrets.toml -#api_key = ... -# environment = "gcp-starter" - -[lancedb] -uri = "./lancedb" - -[qdrant] -# fill and place credentials in .secrets.toml -# url = "https://YOUR-QDRANT-URL" -# api_key = "..." +# Vector database configuration removed - now using rapidfuzz fuzzy matching +# See [pr_similar_issue] section for fuzzy matching configuration [best_practices] content = "" diff --git a/pr_agent/tools/pr_similar_issue.py b/pr_agent/tools/pr_similar_issue.py index 7a97d85b..7772880d 100644 --- a/pr_agent/tools/pr_similar_issue.py +++ b/pr_agent/tools/pr_similar_issue.py @@ -1,692 +1,247 @@ +""" +PR Similar Issue Finder - Simplified with Fuzzy Matching + +Uses rapidfuzz for fast, local fuzzy text matching instead of vector embeddings. +No external APIs or databases required. 
+""" + import time -from enum import Enum -from typing import List +from typing import List, Tuple, Dict +from rapidfuzz import fuzz, process -import openai -from pydantic import BaseModel, Field - -from pr_agent.algo import MAX_TOKENS -from pr_agent.algo.token_handler import TokenHandler -from pr_agent.algo.utils import get_max_tokens from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider from pr_agent.log import get_logger -MODEL = "text-embedding-ada-002" - class PRSimilarIssue: - def __init__(self, issue_url: str, ai_handler, args: list = None): + """ + Find similar issues using fuzzy text matching. + + Replaces vector-based search (Pinecone/LanceDB/Qdrant + OpenAI embeddings) + with simple, fast fuzzy matching using rapidfuzz. + """ + + def __init__(self, issue_url: str, ai_handler=None, args: list = None): + """Initialize the similar issue finder.""" if get_settings().config.git_provider != "github": raise Exception("Only github is supported for similar issue tool") self.cli_mode = get_settings().CONFIG.CLI_MODE self.max_issues_to_scan = get_settings().pr_similar_issue.max_issues_to_scan + self.number_of_similar_issues = get_settings().pr_similar_issue.get( + 'number_of_similar_issues', 5 + ) + self.min_similarity_score = get_settings().pr_similar_issue.get( + 'min_similarity_score', 60 + ) + self.skip_comments = get_settings().pr_similar_issue.get( + 'skip_comments', False + ) + self.issue_url = issue_url self.git_provider = get_git_provider()() - repo_name, issue_number = self.git_provider._parse_issue_url(issue_url.split('=')[-1]) + + # Parse issue URL + repo_name, issue_number = self.git_provider._parse_issue_url( + issue_url.split('=')[-1] + ) self.git_provider.repo = repo_name self.git_provider.repo_obj = self.git_provider.github_client.get_repo(repo_name) - self.token_handler = TokenHandler() - repo_obj = self.git_provider.repo_obj - repo_name_for_index = self.repo_name_for_index = repo_obj.full_name.lower().replace('/', '-').replace('_/', '-') - index_name = self.index_name = "codium-ai-pr-agent-issues" + self.query_issue_number = issue_number - if get_settings().pr_similar_issue.vectordb == "pinecone": - try: - import pandas as pd - import pinecone - from pinecone_datasets import Dataset, DatasetMetadata - except: - raise Exception("Please install 'pinecone' and 'pinecone_datasets' to use pinecone as vectordb") - # assuming pinecone api key and environment are set in secrets file - try: - api_key = get_settings().pinecone.api_key - environment = get_settings().pinecone.environment - except Exception: - if not self.cli_mode: - repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) - issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) - issue_main.create_comment("Please set pinecone api key and environment in secrets file") - raise Exception("Please set pinecone api key and environment in secrets file") - - # check if index exists, and if repo is already indexed - run_from_scratch = False - if run_from_scratch: # for debugging - pinecone.init(api_key=api_key, environment=environment) - if index_name in pinecone.list_indexes(): - get_logger().info('Removing index...') - pinecone.delete_index(index_name) - get_logger().info('Done') - - upsert = True - pinecone.init(api_key=api_key, environment=environment) - if not index_name in pinecone.list_indexes(): - run_from_scratch = True - upsert = False - else: - if get_settings().pr_similar_issue.force_update_dataset: - 
upsert = True - else: - pinecone_index = pinecone.Index(index_name=index_name) - res = pinecone_index.fetch([f"example_issue_{repo_name_for_index}"]).to_dict() - if res["vectors"]: - upsert = False - - if run_from_scratch or upsert: # index the entire repo - get_logger().info('Indexing the entire repo...') - - get_logger().info('Getting issues...') - issues = list(repo_obj.get_issues(state='all')) - get_logger().info('Done') - self._update_index_with_issues(issues, repo_name_for_index, upsert=upsert) - else: # update index if needed - pinecone_index = pinecone.Index(index_name=index_name) - issues_to_update = [] - issues_paginated_list = repo_obj.get_issues(state='all') - counter = 1 - for issue in issues_paginated_list: - if issue.pull_request: - continue - issue_str, comments, number = self._process_issue(issue) - issue_key = f"issue_{number}" - id = issue_key + "." + "issue" - res = pinecone_index.fetch([id]).to_dict() - is_new_issue = True - for vector in res["vectors"].values(): - if vector['metadata']['repo'] == repo_name_for_index: - is_new_issue = False - break - if is_new_issue: - counter += 1 - issues_to_update.append(issue) - else: - break - - if issues_to_update: - get_logger().info(f'Updating index with {counter} new issues...') - self._update_index_with_issues(issues_to_update, repo_name_for_index, upsert=True) - else: - get_logger().info('No new issues to update') - - elif get_settings().pr_similar_issue.vectordb == "lancedb": - try: - import lancedb # import lancedb only if needed - except: - raise Exception("Please install lancedb to use lancedb as vectordb") - self.db = lancedb.connect(get_settings().lancedb.uri) - self.table = None - - run_from_scratch = False - if run_from_scratch: # for debugging - if index_name in self.db.table_names(): - get_logger().info('Removing Table...') - self.db.drop_table(index_name) - get_logger().info('Done') - - ingest = True - if index_name not in self.db.table_names(): - run_from_scratch = True - ingest = False - else: - if get_settings().pr_similar_issue.force_update_dataset: - ingest = True - else: - self.table = self.db[index_name] - res = self.table.search().limit(len(self.table)).where(f"id='example_issue_{repo_name_for_index}'").to_list() - get_logger().info("result: ", res) - if res[0].get("vector"): - ingest = False - - if run_from_scratch or ingest: # indexing the entire repo - get_logger().info('Indexing the entire repo...') - - get_logger().info('Getting issues...') - issues = list(repo_obj.get_issues(state='all')) - get_logger().info('Done') - - self._update_table_with_issues(issues, repo_name_for_index, ingest=ingest) - else: # update table if needed - issues_to_update = [] - issues_paginated_list = repo_obj.get_issues(state='all') - counter = 1 - for issue in issues_paginated_list: - if issue.pull_request: - continue - issue_str, comments, number = self._process_issue(issue) - issue_key = f"issue_{number}" - issue_id = issue_key + "." 
+ "issue" - res = self.table.search().limit(len(self.table)).where(f"id='{issue_id}'").to_list() - is_new_issue = True - for r in res: - if r['metadata']['repo'] == repo_name_for_index: - is_new_issue = False - break - if is_new_issue: - counter += 1 - issues_to_update.append(issue) - else: - break - - if issues_to_update: - get_logger().info(f'Updating index with {counter} new issues...') - self._update_table_with_issues(issues_to_update, repo_name_for_index, ingest=True) - else: - get_logger().info('No new issues to update') - - elif get_settings().pr_similar_issue.vectordb == "qdrant": - try: - import qdrant_client - from qdrant_client.models import (Distance, FieldCondition, - Filter, MatchValue, - PointStruct, VectorParams) - except Exception: - raise Exception("Please install qdrant-client to use qdrant as vectordb") - - api_key = None - url = None - try: - api_key = get_settings().qdrant.api_key - url = get_settings().qdrant.url - except Exception: - if not self.cli_mode: - repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) - issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) - issue_main.create_comment("Please set qdrant url and api key in secrets file") - raise Exception("Please set qdrant url and api key in secrets file") - - self.qdrant = qdrant_client.QdrantClient(url=url, api_key=api_key) - - run_from_scratch = False - ingest = True - - if not self.qdrant.collection_exists(collection_name=self.index_name): - run_from_scratch = True - ingest = False - self.qdrant.create_collection( - collection_name=self.index_name, - vectors_config=VectorParams(size=1536, distance=Distance.COSINE), - ) - else: - if get_settings().pr_similar_issue.force_update_dataset: - ingest = True - else: - response = self.qdrant.count( - collection_name=self.index_name, - count_filter=Filter(must=[ - FieldCondition(key="metadata.repo", match=MatchValue(value=repo_name_for_index)), - FieldCondition(key="id", match=MatchValue(value=f"example_issue_{repo_name_for_index}")), - ]), - ) - ingest = True if response.count == 0 else False - - if run_from_scratch or ingest: - get_logger().info('Indexing the entire repo...') - get_logger().info('Getting issues...') - issues = list(repo_obj.get_issues(state='all')) - get_logger().info('Done') - self._update_qdrant_with_issues(issues, repo_name_for_index, ingest=ingest) - else: - issues_to_update = [] - issues_paginated_list = repo_obj.get_issues(state='all') - counter = 1 - for issue in issues_paginated_list: - if issue.pull_request: - continue - issue_str, comments, number = self._process_issue(issue) - issue_key = f"issue_{number}" - point_id = issue_key + "." 
+ "issue" - response = self.qdrant.count( - collection_name=self.index_name, - count_filter=Filter(must=[ - FieldCondition(key="id", match=MatchValue(value=point_id)), - FieldCondition(key="metadata.repo", match=MatchValue(value=repo_name_for_index)), - ]), - ) - if response.count == 0: - counter += 1 - issues_to_update.append(issue) - else: - break - - if issues_to_update: - get_logger().info(f'Updating index with {counter} new issues...') - self._update_qdrant_with_issues(issues_to_update, repo_name_for_index, ingest=True) - else: - get_logger().info('No new issues to update') + # In-memory cache for issues + self.issues_cache: Dict[int, Dict[str, str]] = {} + get_logger().info(f"Initialized PRSimilarIssue for {repo_name} issue #{issue_number}") async def run(self): - get_logger().info('Getting issue...') - repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) - issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) - issue_str, comments, number = self._process_issue(issue_main) - openai.api_key = get_settings().openai.key - get_logger().info('Done') + """Main execution method - find and post similar issues.""" + try: + get_logger().info("Starting similar issue search...") - get_logger().info('Querying...') - res = openai.Embedding.create(input=[issue_str], engine=MODEL) - embeds = [record['embedding'] for record in res['data']] + # 1. Fetch all issues from GitHub + get_logger().info("Fetching issues from GitHub...") + repo_obj = self.git_provider.repo_obj + issues_list = list(repo_obj.get_issues(state='all')) + get_logger().info(f"Found {len(issues_list)} total issues") - relevant_issues_number_list = [] - relevant_comment_number_list = [] - score_list = [] + # 2. Index issues in memory + get_logger().info("Indexing issues...") + self._index_issues(issues_list) - if get_settings().pr_similar_issue.vectordb == "pinecone": - pinecone_index = pinecone.Index(index_name=self.index_name) - res = pinecone_index.query(embeds[0], - top_k=5, - filter={"repo": self.repo_name_for_index}, - include_metadata=True).to_dict() + # 3. 
Get query issue details + query_issue = repo_obj.get_issue(self.query_issue_number) + query_title = query_issue.title + query_body = query_issue.body or "" - for r in res['matches']: - # skip example issue - if 'example_issue_' in r["id"]: - continue + get_logger().info(f"Query issue: {query_title}") - try: - issue_number = int(r["id"].split('.')[0].split('_')[-1]) - except: - get_logger().debug(f"Failed to parse issue number from {r['id']}") - continue - - if original_issue_number == issue_number: - continue - if issue_number not in relevant_issues_number_list: - relevant_issues_number_list.append(issue_number) - if 'comment' in r["id"]: - relevant_comment_number_list.append(int(r["id"].split('.')[1].split('_')[-1])) - else: - relevant_comment_number_list.append(-1) - score_list.append(str("{:.2f}".format(r['score']))) - get_logger().info('Done') - - elif get_settings().pr_similar_issue.vectordb == "lancedb": - res = self.table.search(embeds[0]).where(f"metadata.repo='{self.repo_name_for_index}'", prefilter=True).to_list() - - for r in res: - # skip example issue - if 'example_issue_' in r["id"]: - continue - - try: - issue_number = int(r["id"].split('.')[0].split('_')[-1]) - except: - get_logger().debug(f"Failed to parse issue number from {r['id']}") - continue - - if original_issue_number == issue_number: - continue - if issue_number not in relevant_issues_number_list: - relevant_issues_number_list.append(issue_number) - - if 'comment' in r["id"]: - relevant_comment_number_list.append(int(r["id"].split('.')[1].split('_')[-1])) - else: - relevant_comment_number_list.append(-1) - score_list.append(str("{:.2f}".format(1-r['_distance']))) - get_logger().info('Done') - - elif get_settings().pr_similar_issue.vectordb == "qdrant": - from qdrant_client.models import FieldCondition, Filter, MatchValue - res = self.qdrant.search( - collection_name=self.index_name, - query_vector=embeds[0], - limit=5, - query_filter=Filter(must=[FieldCondition(key="metadata.repo", match=MatchValue(value=self.repo_name_for_index))]), - with_payload=True, + # 4. Find similar issues using fuzzy matching + get_logger().info("Finding similar issues...") + similar_issues = self._find_similar( + query_title=query_title, + query_body=query_body, + skip_issue_number=self.query_issue_number, + top_k=self.number_of_similar_issues ) - for r in res: - rid = r.payload.get("id", "") - if 'example_issue_' in rid: - continue - try: - issue_number = int(rid.split('.')[0].split('_')[-1]) - except Exception: - get_logger().debug(f"Failed to parse issue number from {rid}") - continue - if original_issue_number == issue_number: - continue - if issue_number not in relevant_issues_number_list: - relevant_issues_number_list.append(issue_number) - if 'comment' in rid: - relevant_comment_number_list.append(int(rid.split('.')[1].split('_')[-1])) - else: - relevant_comment_number_list.append(-1) - score_list.append(str("{:.2f}".format(r.score))) - get_logger().info('Done') - - get_logger().info('Publishing response...') - similar_issues_str = "### Similar Issues\n___\n\n" - - for i, issue_number_similar in enumerate(relevant_issues_number_list): - issue = self.git_provider.repo_obj.get_issue(issue_number_similar) - title = issue.title - url = issue.html_url - if relevant_comment_number_list[i] != -1: - url = list(issue.get_comments())[relevant_comment_number_list[i]].html_url - similar_issues_str += f"{i + 1}. 
**[{title}]({url})** (score={score_list[i]})\n\n" - if get_settings().config.publish_output: - response = issue_main.create_comment(similar_issues_str) - get_logger().info(similar_issues_str) - get_logger().info('Done') - - def _process_issue(self, issue): - header = issue.title - body = issue.body - number = issue.number - if get_settings().pr_similar_issue.skip_comments: - comments = [] - else: - comments = list(issue.get_comments()) - issue_str = f"Issue Header: \"{header}\"\n\nIssue Body:\n{body}" - return issue_str, comments, number - - def _update_index_with_issues(self, issues_list, repo_name_for_index, upsert=False): - get_logger().info('Processing issues...') - corpus = Corpus() - example_issue_record = Record( - id=f"example_issue_{repo_name_for_index}", - text="example_issue", - metadata=Metadata(repo=repo_name_for_index) - ) - corpus.append(example_issue_record) - - counter = 0 - for issue in issues_list: - if issue.pull_request: - continue - - counter += 1 - if counter % 100 == 0: - get_logger().info(f"Scanned {counter} issues") - if counter >= self.max_issues_to_scan: - get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") - break - - issue_str, comments, number = self._process_issue(issue) - issue_key = f"issue_{number}" - username = issue.user.login - created_at = str(issue.created_at) - if len(issue_str) < 8000 or \ - self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): # fast reject first - issue_record = Record( - id=issue_key + "." + "issue", - text=issue_str, - metadata=Metadata(repo=repo_name_for_index, - username=username, - created_at=created_at, - level=IssueLevel.ISSUE) - ) - corpus.append(issue_record) - if comments: - for j, comment in enumerate(comments): - comment_body = comment.body - num_words_comment = len(comment_body.split()) - if num_words_comment < 10 or not isinstance(comment_body, str): - continue - - if len(comment_body) < 8000 or \ - self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: - comment_record = Record( - id=issue_key + ".comment_" + str(j + 1), - text=comment_body, - metadata=Metadata(repo=repo_name_for_index, - username=username, # use issue username for all comments - created_at=created_at, - level=IssueLevel.COMMENT) - ) - corpus.append(comment_record) - df = pd.DataFrame(corpus.dict()["documents"]) - get_logger().info('Done') - - get_logger().info('Embedding...') - openai.api_key = get_settings().openai.key - list_to_encode = list(df["text"].values) - try: - res = openai.Embedding.create(input=list_to_encode, engine=MODEL) - embeds = [record['embedding'] for record in res['data']] - except: - embeds = [] - get_logger().error('Failed to embed entire list, embedding one by one...') - for i, text in enumerate(list_to_encode): - try: - res = openai.Embedding.create(input=[text], engine=MODEL) - embeds.append(res['data'][0]['embedding']) - except: - embeds.append([0] * 1536) - df["values"] = embeds - meta = DatasetMetadata.empty() - meta.dense_model.dimension = len(embeds[0]) - ds = Dataset.from_pandas(df, meta) - get_logger().info('Done') - - api_key = get_settings().pinecone.api_key - environment = get_settings().pinecone.environment - if not upsert: - get_logger().info('Creating index from scratch...') - ds.to_pinecone_index(self.index_name, api_key=api_key, environment=environment) - time.sleep(15) # wait for pinecone to finalize indexing before querying - else: - get_logger().info('Upserting index...') - namespace = "" - batch_size: int = 100 - concurrency: int = 10 - 
pinecone.init(api_key=api_key, environment=environment) - ds._upsert_to_index(self.index_name, namespace, batch_size, concurrency) - time.sleep(5) # wait for pinecone to finalize upserting before querying - get_logger().info('Done') - - def _update_table_with_issues(self, issues_list, repo_name_for_index, ingest=False): - get_logger().info('Processing issues...') - - corpus = Corpus() - example_issue_record = Record( - id=f"example_issue_{repo_name_for_index}", - text="example_issue", - metadata=Metadata(repo=repo_name_for_index) - ) - corpus.append(example_issue_record) - - counter = 0 - for issue in issues_list: - if issue.pull_request: - continue - - counter += 1 - if counter % 100 == 0: - get_logger().info(f"Scanned {counter} issues") - if counter >= self.max_issues_to_scan: - get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") - break - - issue_str, comments, number = self._process_issue(issue) - issue_key = f"issue_{number}" - username = issue.user.login - created_at = str(issue.created_at) - if len(issue_str) < 8000 or \ - self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): # fast reject first - issue_record = Record( - id=issue_key + "." + "issue", - text=issue_str, - metadata=Metadata(repo=repo_name_for_index, - username=username, - created_at=created_at, - level=IssueLevel.ISSUE) - ) - corpus.append(issue_record) - if comments: - for j, comment in enumerate(comments): - comment_body = comment.body - num_words_comment = len(comment_body.split()) - if num_words_comment < 10 or not isinstance(comment_body, str): - continue - - if len(comment_body) < 8000 or \ - self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: - comment_record = Record( - id=issue_key + ".comment_" + str(j + 1), - text=comment_body, - metadata=Metadata(repo=repo_name_for_index, - username=username, # use issue username for all comments - created_at=created_at, - level=IssueLevel.COMMENT) - ) - corpus.append(comment_record) - df = pd.DataFrame(corpus.dict()["documents"]) - get_logger().info('Done') - - get_logger().info('Embedding...') - openai.api_key = get_settings().openai.key - list_to_encode = list(df["text"].values) - try: - res = openai.Embedding.create(input=list_to_encode, engine=MODEL) - embeds = [record['embedding'] for record in res['data']] - except: - embeds = [] - get_logger().error('Failed to embed entire list, embedding one by one...') - for i, text in enumerate(list_to_encode): - try: - res = openai.Embedding.create(input=[text], engine=MODEL) - embeds.append(res['data'][0]['embedding']) - except: - embeds.append([0] * 1536) - df["vector"] = embeds - get_logger().info('Done') - - if not ingest: - get_logger().info('Creating table from scratch...') - self.table = self.db.create_table(self.index_name, data=df, mode="overwrite") - time.sleep(15) - else: - get_logger().info('Ingesting in Table...') - if self.index_name not in self.db.table_names(): - self.table.add(df) + # 5. 
Post results + if similar_issues: + get_logger().info(f"Found {len(similar_issues)} similar issues") + self._post_results(query_issue, similar_issues) else: - get_logger().info(f"Table {self.index_name} doesn't exists!") - time.sleep(5) - get_logger().info('Done') + get_logger().info("No similar issues found above threshold") + if not get_settings().pr_similar_issue.get('skip_comments', False): + query_issue.create_comment("No similar issues found.") + return similar_issues - def _update_qdrant_with_issues(self, issues_list, repo_name_for_index, ingest=False): - try: - import uuid - - import pandas as pd - from qdrant_client.models import PointStruct - except Exception: + except Exception as e: + get_logger().error(f"Error in PRSimilarIssue.run(): {e}") raise - get_logger().info('Processing issues...') - corpus = Corpus() - example_issue_record = Record( - id=f"example_issue_{repo_name_for_index}", - text="example_issue", - metadata=Metadata(repo=repo_name_for_index) - ) - corpus.append(example_issue_record) + def _index_issues(self, issues_list: List) -> None: + """ + Index issues in memory for fast searching. + Args: + issues_list: List of GitHub issue objects + """ counter = 0 + for issue in issues_list: + # Skip pull requests if issue.pull_request: continue counter += 1 - if counter % 100 == 0: - get_logger().info(f"Scanned {counter} issues") if counter >= self.max_issues_to_scan: - get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") + get_logger().info(f"Reached max issues to scan: {self.max_issues_to_scan}") break - issue_str, comments, number = self._process_issue(issue) - issue_key = f"issue_{number}" - username = issue.user.login - created_at = str(issue.created_at) - if len(issue_str) < 8000 or \ - self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): - issue_record = Record( - id=issue_key + "." 
+ "issue", - text=issue_str, - metadata=Metadata(repo=repo_name_for_index, - username=username, - created_at=created_at, - level=IssueLevel.ISSUE) - ) - corpus.append(issue_record) - if comments: - for j, comment in enumerate(comments): - comment_body = comment.body - num_words_comment = len(comment_body.split()) - if num_words_comment < 10 or not isinstance(comment_body, str): - continue + # Extract issue content + title = issue.title + body = issue.body or "" - if len(comment_body) < 8000 or \ - self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: - comment_record = Record( - id=issue_key + ".comment_" + str(j + 1), - text=comment_body, - metadata=Metadata(repo=repo_name_for_index, - username=username, - created_at=created_at, - level=IssueLevel.COMMENT) - ) - corpus.append(comment_record) - - df = pd.DataFrame(corpus.dict()["documents"]) - get_logger().info('Done') - - get_logger().info('Embedding...') - openai.api_key = get_settings().openai.key - list_to_encode = list(df["text"].values) - try: - res = openai.Embedding.create(input=list_to_encode, engine=MODEL) - embeds = [record['embedding'] for record in res['data']] - except Exception: - embeds = [] - get_logger().error('Failed to embed entire list, embedding one by one...') - for i, text in enumerate(list_to_encode): + # Optionally include comments + comments_text = "" + if not self.skip_comments: try: - res = openai.Embedding.create(input=[text], engine=MODEL) - embeds.append(res['data'][0]['embedding']) - except Exception: - embeds.append([0] * 1536) - df["vector"] = embeds - get_logger().info('Done') + comments = list(issue.get_comments()) + comments_text = " ".join([c.body for c in comments if c.body]) + except: + pass # Comments not critical - get_logger().info('Upserting into Qdrant...') - points = [] - for row in df.to_dict(orient="records"): - points.append( - PointStruct(id=uuid.uuid5(uuid.NAMESPACE_DNS, row["id"]).hex, vector=row["vector"], payload={"id": row["id"], "text": row["text"], "metadata": row["metadata"]}) + # Store in cache + self.issues_cache[issue.number] = { + 'title': title, + 'body': body, + 'comments': comments_text, + 'url': issue.html_url, + 'state': issue.state, + } + + get_logger().info(f"Indexed {len(self.issues_cache)} issues") + + def _find_similar( + self, + query_title: str, + query_body: str, + skip_issue_number: int = None, + top_k: int = 5 + ) -> List[Tuple[float, int, str, str]]: + """ + Find similar issues using fuzzy text matching. 
+ + Args: + query_title: Title of query issue + query_body: Body of query issue + skip_issue_number: Issue number to skip (the query issue itself) + top_k: Number of similar issues to return + + Returns: + List of tuples: (score, issue_number, title, url) + """ + # Build query string (weight title more by repeating it) + query_text = f"{query_title} {query_title} {query_body}" + + # Prepare choices for fuzzy matching + choices = {} + for issue_num, issue_data in self.issues_cache.items(): + # Skip the query issue itself + if skip_issue_number and issue_num == skip_issue_number: + continue + + # Build issue text (weight title 2x) + issue_text = ( + f"{issue_data['title']} {issue_data['title']} " + f"{issue_data['body']} {issue_data['comments']}" ) - self.qdrant.upsert(collection_name=self.index_name, points=points) - get_logger().info('Done') + choices[issue_num] = issue_text + if not choices: + get_logger().warning("No issues available for comparison") + return [] -class IssueLevel(str, Enum): - ISSUE = "issue" - COMMENT = "comment" + # Use rapidfuzz for fuzzy matching + # token_sort_ratio: handles word order differences well + results = process.extract( + query_text, + choices, + scorer=fuzz.token_sort_ratio, + limit=top_k * 2, # Get extra in case we need to filter + ) + # Filter by minimum score and format results + similar_issues = [] + for matched_text, score, issue_num in results: + if score >= self.min_similarity_score: + issue_data = self.issues_cache[issue_num] + similar_issues.append(( + score, + issue_num, + issue_data['title'], + issue_data['url'] + )) -class Metadata(BaseModel): - repo: str - username: str = Field(default="@codium") - created_at: str = Field(default="01-01-1970 00:00:00.00000") - level: IssueLevel = Field(default=IssueLevel.ISSUE) + # Stop once we have enough results + if len(similar_issues) >= top_k: + break - class Config: - use_enum_values = True + return similar_issues + def _post_results( + self, + query_issue, + similar_issues: List[Tuple[float, int, str, str]] + ) -> None: + """ + Post similar issues as a comment. -class Record(BaseModel): - id: str - text: str - metadata: Metadata + Args: + query_issue: GitHub issue object to comment on + similar_issues: List of (score, number, title, url) tuples + """ + # Build comment + comment_lines = ["### Similar Issues\n___\n"] + for i, (score, number, title, url) in enumerate(similar_issues, 1): + # Format score as percentage + score_pct = f"{score:.1f}%" + comment_lines.append( + f"{i}. 
**[{title}]({url})** (similarity: {score_pct})\n" + ) -class Corpus(BaseModel): - documents: List[Record] = Field(default=[]) + similar_issues_str = "\n".join(comment_lines) - def append(self, r: Record): - self.documents.append(r) + # Post comment (unless skip_comments is True) + if not get_settings().pr_similar_issue.get('skip_comments', False): + try: + query_issue.create_comment(similar_issues_str) + get_logger().info("Posted similar issues comment") + except Exception as e: + get_logger().error(f"Failed to post comment: {e}") + + # Always log results + get_logger().info(f"\n{similar_issues_str}") diff --git a/pr_agent/tools/pr_similar_issue.py.bak b/pr_agent/tools/pr_similar_issue.py.bak new file mode 100644 index 00000000..7a97d85b --- /dev/null +++ b/pr_agent/tools/pr_similar_issue.py.bak @@ -0,0 +1,692 @@ +import time +from enum import Enum +from typing import List + +import openai +from pydantic import BaseModel, Field + +from pr_agent.algo import MAX_TOKENS +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import get_max_tokens +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.log import get_logger + +MODEL = "text-embedding-ada-002" + + +class PRSimilarIssue: + def __init__(self, issue_url: str, ai_handler, args: list = None): + if get_settings().config.git_provider != "github": + raise Exception("Only github is supported for similar issue tool") + + self.cli_mode = get_settings().CONFIG.CLI_MODE + self.max_issues_to_scan = get_settings().pr_similar_issue.max_issues_to_scan + self.issue_url = issue_url + self.git_provider = get_git_provider()() + repo_name, issue_number = self.git_provider._parse_issue_url(issue_url.split('=')[-1]) + self.git_provider.repo = repo_name + self.git_provider.repo_obj = self.git_provider.github_client.get_repo(repo_name) + self.token_handler = TokenHandler() + repo_obj = self.git_provider.repo_obj + repo_name_for_index = self.repo_name_for_index = repo_obj.full_name.lower().replace('/', '-').replace('_/', '-') + index_name = self.index_name = "codium-ai-pr-agent-issues" + + if get_settings().pr_similar_issue.vectordb == "pinecone": + try: + import pandas as pd + import pinecone + from pinecone_datasets import Dataset, DatasetMetadata + except: + raise Exception("Please install 'pinecone' and 'pinecone_datasets' to use pinecone as vectordb") + # assuming pinecone api key and environment are set in secrets file + try: + api_key = get_settings().pinecone.api_key + environment = get_settings().pinecone.environment + except Exception: + if not self.cli_mode: + repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) + issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) + issue_main.create_comment("Please set pinecone api key and environment in secrets file") + raise Exception("Please set pinecone api key and environment in secrets file") + + # check if index exists, and if repo is already indexed + run_from_scratch = False + if run_from_scratch: # for debugging + pinecone.init(api_key=api_key, environment=environment) + if index_name in pinecone.list_indexes(): + get_logger().info('Removing index...') + pinecone.delete_index(index_name) + get_logger().info('Done') + + upsert = True + pinecone.init(api_key=api_key, environment=environment) + if not index_name in pinecone.list_indexes(): + run_from_scratch = True + upsert = False + else: + if get_settings().pr_similar_issue.force_update_dataset: + 
upsert = True + else: + pinecone_index = pinecone.Index(index_name=index_name) + res = pinecone_index.fetch([f"example_issue_{repo_name_for_index}"]).to_dict() + if res["vectors"]: + upsert = False + + if run_from_scratch or upsert: # index the entire repo + get_logger().info('Indexing the entire repo...') + + get_logger().info('Getting issues...') + issues = list(repo_obj.get_issues(state='all')) + get_logger().info('Done') + self._update_index_with_issues(issues, repo_name_for_index, upsert=upsert) + else: # update index if needed + pinecone_index = pinecone.Index(index_name=index_name) + issues_to_update = [] + issues_paginated_list = repo_obj.get_issues(state='all') + counter = 1 + for issue in issues_paginated_list: + if issue.pull_request: + continue + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + id = issue_key + "." + "issue" + res = pinecone_index.fetch([id]).to_dict() + is_new_issue = True + for vector in res["vectors"].values(): + if vector['metadata']['repo'] == repo_name_for_index: + is_new_issue = False + break + if is_new_issue: + counter += 1 + issues_to_update.append(issue) + else: + break + + if issues_to_update: + get_logger().info(f'Updating index with {counter} new issues...') + self._update_index_with_issues(issues_to_update, repo_name_for_index, upsert=True) + else: + get_logger().info('No new issues to update') + + elif get_settings().pr_similar_issue.vectordb == "lancedb": + try: + import lancedb # import lancedb only if needed + except: + raise Exception("Please install lancedb to use lancedb as vectordb") + self.db = lancedb.connect(get_settings().lancedb.uri) + self.table = None + + run_from_scratch = False + if run_from_scratch: # for debugging + if index_name in self.db.table_names(): + get_logger().info('Removing Table...') + self.db.drop_table(index_name) + get_logger().info('Done') + + ingest = True + if index_name not in self.db.table_names(): + run_from_scratch = True + ingest = False + else: + if get_settings().pr_similar_issue.force_update_dataset: + ingest = True + else: + self.table = self.db[index_name] + res = self.table.search().limit(len(self.table)).where(f"id='example_issue_{repo_name_for_index}'").to_list() + get_logger().info("result: ", res) + if res[0].get("vector"): + ingest = False + + if run_from_scratch or ingest: # indexing the entire repo + get_logger().info('Indexing the entire repo...') + + get_logger().info('Getting issues...') + issues = list(repo_obj.get_issues(state='all')) + get_logger().info('Done') + + self._update_table_with_issues(issues, repo_name_for_index, ingest=ingest) + else: # update table if needed + issues_to_update = [] + issues_paginated_list = repo_obj.get_issues(state='all') + counter = 1 + for issue in issues_paginated_list: + if issue.pull_request: + continue + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + issue_id = issue_key + "." 
+ "issue" + res = self.table.search().limit(len(self.table)).where(f"id='{issue_id}'").to_list() + is_new_issue = True + for r in res: + if r['metadata']['repo'] == repo_name_for_index: + is_new_issue = False + break + if is_new_issue: + counter += 1 + issues_to_update.append(issue) + else: + break + + if issues_to_update: + get_logger().info(f'Updating index with {counter} new issues...') + self._update_table_with_issues(issues_to_update, repo_name_for_index, ingest=True) + else: + get_logger().info('No new issues to update') + + elif get_settings().pr_similar_issue.vectordb == "qdrant": + try: + import qdrant_client + from qdrant_client.models import (Distance, FieldCondition, + Filter, MatchValue, + PointStruct, VectorParams) + except Exception: + raise Exception("Please install qdrant-client to use qdrant as vectordb") + + api_key = None + url = None + try: + api_key = get_settings().qdrant.api_key + url = get_settings().qdrant.url + except Exception: + if not self.cli_mode: + repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) + issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) + issue_main.create_comment("Please set qdrant url and api key in secrets file") + raise Exception("Please set qdrant url and api key in secrets file") + + self.qdrant = qdrant_client.QdrantClient(url=url, api_key=api_key) + + run_from_scratch = False + ingest = True + + if not self.qdrant.collection_exists(collection_name=self.index_name): + run_from_scratch = True + ingest = False + self.qdrant.create_collection( + collection_name=self.index_name, + vectors_config=VectorParams(size=1536, distance=Distance.COSINE), + ) + else: + if get_settings().pr_similar_issue.force_update_dataset: + ingest = True + else: + response = self.qdrant.count( + collection_name=self.index_name, + count_filter=Filter(must=[ + FieldCondition(key="metadata.repo", match=MatchValue(value=repo_name_for_index)), + FieldCondition(key="id", match=MatchValue(value=f"example_issue_{repo_name_for_index}")), + ]), + ) + ingest = True if response.count == 0 else False + + if run_from_scratch or ingest: + get_logger().info('Indexing the entire repo...') + get_logger().info('Getting issues...') + issues = list(repo_obj.get_issues(state='all')) + get_logger().info('Done') + self._update_qdrant_with_issues(issues, repo_name_for_index, ingest=ingest) + else: + issues_to_update = [] + issues_paginated_list = repo_obj.get_issues(state='all') + counter = 1 + for issue in issues_paginated_list: + if issue.pull_request: + continue + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + point_id = issue_key + "." 
+ "issue" + response = self.qdrant.count( + collection_name=self.index_name, + count_filter=Filter(must=[ + FieldCondition(key="id", match=MatchValue(value=point_id)), + FieldCondition(key="metadata.repo", match=MatchValue(value=repo_name_for_index)), + ]), + ) + if response.count == 0: + counter += 1 + issues_to_update.append(issue) + else: + break + + if issues_to_update: + get_logger().info(f'Updating index with {counter} new issues...') + self._update_qdrant_with_issues(issues_to_update, repo_name_for_index, ingest=True) + else: + get_logger().info('No new issues to update') + + + async def run(self): + get_logger().info('Getting issue...') + repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) + issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) + issue_str, comments, number = self._process_issue(issue_main) + openai.api_key = get_settings().openai.key + get_logger().info('Done') + + get_logger().info('Querying...') + res = openai.Embedding.create(input=[issue_str], engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + + relevant_issues_number_list = [] + relevant_comment_number_list = [] + score_list = [] + + if get_settings().pr_similar_issue.vectordb == "pinecone": + pinecone_index = pinecone.Index(index_name=self.index_name) + res = pinecone_index.query(embeds[0], + top_k=5, + filter={"repo": self.repo_name_for_index}, + include_metadata=True).to_dict() + + for r in res['matches']: + # skip example issue + if 'example_issue_' in r["id"]: + continue + + try: + issue_number = int(r["id"].split('.')[0].split('_')[-1]) + except: + get_logger().debug(f"Failed to parse issue number from {r['id']}") + continue + + if original_issue_number == issue_number: + continue + if issue_number not in relevant_issues_number_list: + relevant_issues_number_list.append(issue_number) + if 'comment' in r["id"]: + relevant_comment_number_list.append(int(r["id"].split('.')[1].split('_')[-1])) + else: + relevant_comment_number_list.append(-1) + score_list.append(str("{:.2f}".format(r['score']))) + get_logger().info('Done') + + elif get_settings().pr_similar_issue.vectordb == "lancedb": + res = self.table.search(embeds[0]).where(f"metadata.repo='{self.repo_name_for_index}'", prefilter=True).to_list() + + for r in res: + # skip example issue + if 'example_issue_' in r["id"]: + continue + + try: + issue_number = int(r["id"].split('.')[0].split('_')[-1]) + except: + get_logger().debug(f"Failed to parse issue number from {r['id']}") + continue + + if original_issue_number == issue_number: + continue + if issue_number not in relevant_issues_number_list: + relevant_issues_number_list.append(issue_number) + + if 'comment' in r["id"]: + relevant_comment_number_list.append(int(r["id"].split('.')[1].split('_')[-1])) + else: + relevant_comment_number_list.append(-1) + score_list.append(str("{:.2f}".format(1-r['_distance']))) + get_logger().info('Done') + + elif get_settings().pr_similar_issue.vectordb == "qdrant": + from qdrant_client.models import FieldCondition, Filter, MatchValue + res = self.qdrant.search( + collection_name=self.index_name, + query_vector=embeds[0], + limit=5, + query_filter=Filter(must=[FieldCondition(key="metadata.repo", match=MatchValue(value=self.repo_name_for_index))]), + with_payload=True, + ) + + for r in res: + rid = r.payload.get("id", "") + if 'example_issue_' in rid: + continue + try: + issue_number = int(rid.split('.')[0].split('_')[-1]) + except Exception: + get_logger().debug(f"Failed to parse 
issue number from {rid}") + continue + if original_issue_number == issue_number: + continue + if issue_number not in relevant_issues_number_list: + relevant_issues_number_list.append(issue_number) + if 'comment' in rid: + relevant_comment_number_list.append(int(rid.split('.')[1].split('_')[-1])) + else: + relevant_comment_number_list.append(-1) + score_list.append(str("{:.2f}".format(r.score))) + get_logger().info('Done') + + get_logger().info('Publishing response...') + similar_issues_str = "### Similar Issues\n___\n\n" + + for i, issue_number_similar in enumerate(relevant_issues_number_list): + issue = self.git_provider.repo_obj.get_issue(issue_number_similar) + title = issue.title + url = issue.html_url + if relevant_comment_number_list[i] != -1: + url = list(issue.get_comments())[relevant_comment_number_list[i]].html_url + similar_issues_str += f"{i + 1}. **[{title}]({url})** (score={score_list[i]})\n\n" + if get_settings().config.publish_output: + response = issue_main.create_comment(similar_issues_str) + get_logger().info(similar_issues_str) + get_logger().info('Done') + + def _process_issue(self, issue): + header = issue.title + body = issue.body + number = issue.number + if get_settings().pr_similar_issue.skip_comments: + comments = [] + else: + comments = list(issue.get_comments()) + issue_str = f"Issue Header: \"{header}\"\n\nIssue Body:\n{body}" + return issue_str, comments, number + + def _update_index_with_issues(self, issues_list, repo_name_for_index, upsert=False): + get_logger().info('Processing issues...') + corpus = Corpus() + example_issue_record = Record( + id=f"example_issue_{repo_name_for_index}", + text="example_issue", + metadata=Metadata(repo=repo_name_for_index) + ) + corpus.append(example_issue_record) + + counter = 0 + for issue in issues_list: + if issue.pull_request: + continue + + counter += 1 + if counter % 100 == 0: + get_logger().info(f"Scanned {counter} issues") + if counter >= self.max_issues_to_scan: + get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") + break + + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + username = issue.user.login + created_at = str(issue.created_at) + if len(issue_str) < 8000 or \ + self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): # fast reject first + issue_record = Record( + id=issue_key + "." 
+ "issue", + text=issue_str, + metadata=Metadata(repo=repo_name_for_index, + username=username, + created_at=created_at, + level=IssueLevel.ISSUE) + ) + corpus.append(issue_record) + if comments: + for j, comment in enumerate(comments): + comment_body = comment.body + num_words_comment = len(comment_body.split()) + if num_words_comment < 10 or not isinstance(comment_body, str): + continue + + if len(comment_body) < 8000 or \ + self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: + comment_record = Record( + id=issue_key + ".comment_" + str(j + 1), + text=comment_body, + metadata=Metadata(repo=repo_name_for_index, + username=username, # use issue username for all comments + created_at=created_at, + level=IssueLevel.COMMENT) + ) + corpus.append(comment_record) + df = pd.DataFrame(corpus.dict()["documents"]) + get_logger().info('Done') + + get_logger().info('Embedding...') + openai.api_key = get_settings().openai.key + list_to_encode = list(df["text"].values) + try: + res = openai.Embedding.create(input=list_to_encode, engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + except: + embeds = [] + get_logger().error('Failed to embed entire list, embedding one by one...') + for i, text in enumerate(list_to_encode): + try: + res = openai.Embedding.create(input=[text], engine=MODEL) + embeds.append(res['data'][0]['embedding']) + except: + embeds.append([0] * 1536) + df["values"] = embeds + meta = DatasetMetadata.empty() + meta.dense_model.dimension = len(embeds[0]) + ds = Dataset.from_pandas(df, meta) + get_logger().info('Done') + + api_key = get_settings().pinecone.api_key + environment = get_settings().pinecone.environment + if not upsert: + get_logger().info('Creating index from scratch...') + ds.to_pinecone_index(self.index_name, api_key=api_key, environment=environment) + time.sleep(15) # wait for pinecone to finalize indexing before querying + else: + get_logger().info('Upserting index...') + namespace = "" + batch_size: int = 100 + concurrency: int = 10 + pinecone.init(api_key=api_key, environment=environment) + ds._upsert_to_index(self.index_name, namespace, batch_size, concurrency) + time.sleep(5) # wait for pinecone to finalize upserting before querying + get_logger().info('Done') + + def _update_table_with_issues(self, issues_list, repo_name_for_index, ingest=False): + get_logger().info('Processing issues...') + + corpus = Corpus() + example_issue_record = Record( + id=f"example_issue_{repo_name_for_index}", + text="example_issue", + metadata=Metadata(repo=repo_name_for_index) + ) + corpus.append(example_issue_record) + + counter = 0 + for issue in issues_list: + if issue.pull_request: + continue + + counter += 1 + if counter % 100 == 0: + get_logger().info(f"Scanned {counter} issues") + if counter >= self.max_issues_to_scan: + get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") + break + + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + username = issue.user.login + created_at = str(issue.created_at) + if len(issue_str) < 8000 or \ + self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): # fast reject first + issue_record = Record( + id=issue_key + "." 
+ "issue", + text=issue_str, + metadata=Metadata(repo=repo_name_for_index, + username=username, + created_at=created_at, + level=IssueLevel.ISSUE) + ) + corpus.append(issue_record) + if comments: + for j, comment in enumerate(comments): + comment_body = comment.body + num_words_comment = len(comment_body.split()) + if num_words_comment < 10 or not isinstance(comment_body, str): + continue + + if len(comment_body) < 8000 or \ + self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: + comment_record = Record( + id=issue_key + ".comment_" + str(j + 1), + text=comment_body, + metadata=Metadata(repo=repo_name_for_index, + username=username, # use issue username for all comments + created_at=created_at, + level=IssueLevel.COMMENT) + ) + corpus.append(comment_record) + df = pd.DataFrame(corpus.dict()["documents"]) + get_logger().info('Done') + + get_logger().info('Embedding...') + openai.api_key = get_settings().openai.key + list_to_encode = list(df["text"].values) + try: + res = openai.Embedding.create(input=list_to_encode, engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + except: + embeds = [] + get_logger().error('Failed to embed entire list, embedding one by one...') + for i, text in enumerate(list_to_encode): + try: + res = openai.Embedding.create(input=[text], engine=MODEL) + embeds.append(res['data'][0]['embedding']) + except: + embeds.append([0] * 1536) + df["vector"] = embeds + get_logger().info('Done') + + if not ingest: + get_logger().info('Creating table from scratch...') + self.table = self.db.create_table(self.index_name, data=df, mode="overwrite") + time.sleep(15) + else: + get_logger().info('Ingesting in Table...') + if self.index_name not in self.db.table_names(): + self.table.add(df) + else: + get_logger().info(f"Table {self.index_name} doesn't exists!") + time.sleep(5) + get_logger().info('Done') + + + def _update_qdrant_with_issues(self, issues_list, repo_name_for_index, ingest=False): + try: + import uuid + + import pandas as pd + from qdrant_client.models import PointStruct + except Exception: + raise + + get_logger().info('Processing issues...') + corpus = Corpus() + example_issue_record = Record( + id=f"example_issue_{repo_name_for_index}", + text="example_issue", + metadata=Metadata(repo=repo_name_for_index) + ) + corpus.append(example_issue_record) + + counter = 0 + for issue in issues_list: + if issue.pull_request: + continue + + counter += 1 + if counter % 100 == 0: + get_logger().info(f"Scanned {counter} issues") + if counter >= self.max_issues_to_scan: + get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") + break + + issue_str, comments, number = self._process_issue(issue) + issue_key = f"issue_{number}" + username = issue.user.login + created_at = str(issue.created_at) + if len(issue_str) < 8000 or \ + self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): + issue_record = Record( + id=issue_key + "." 
+ "issue", + text=issue_str, + metadata=Metadata(repo=repo_name_for_index, + username=username, + created_at=created_at, + level=IssueLevel.ISSUE) + ) + corpus.append(issue_record) + if comments: + for j, comment in enumerate(comments): + comment_body = comment.body + num_words_comment = len(comment_body.split()) + if num_words_comment < 10 or not isinstance(comment_body, str): + continue + + if len(comment_body) < 8000 or \ + self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: + comment_record = Record( + id=issue_key + ".comment_" + str(j + 1), + text=comment_body, + metadata=Metadata(repo=repo_name_for_index, + username=username, + created_at=created_at, + level=IssueLevel.COMMENT) + ) + corpus.append(comment_record) + + df = pd.DataFrame(corpus.dict()["documents"]) + get_logger().info('Done') + + get_logger().info('Embedding...') + openai.api_key = get_settings().openai.key + list_to_encode = list(df["text"].values) + try: + res = openai.Embedding.create(input=list_to_encode, engine=MODEL) + embeds = [record['embedding'] for record in res['data']] + except Exception: + embeds = [] + get_logger().error('Failed to embed entire list, embedding one by one...') + for i, text in enumerate(list_to_encode): + try: + res = openai.Embedding.create(input=[text], engine=MODEL) + embeds.append(res['data'][0]['embedding']) + except Exception: + embeds.append([0] * 1536) + df["vector"] = embeds + get_logger().info('Done') + + get_logger().info('Upserting into Qdrant...') + points = [] + for row in df.to_dict(orient="records"): + points.append( + PointStruct(id=uuid.uuid5(uuid.NAMESPACE_DNS, row["id"]).hex, vector=row["vector"], payload={"id": row["id"], "text": row["text"], "metadata": row["metadata"]}) + ) + self.qdrant.upsert(collection_name=self.index_name, points=points) + get_logger().info('Done') + + +class IssueLevel(str, Enum): + ISSUE = "issue" + COMMENT = "comment" + + +class Metadata(BaseModel): + repo: str + username: str = Field(default="@codium") + created_at: str = Field(default="01-01-1970 00:00:00.00000") + level: IssueLevel = Field(default=IssueLevel.ISSUE) + + class Config: + use_enum_values = True + + +class Record(BaseModel): + id: str + text: str + metadata: Metadata + + +class Corpus(BaseModel): + documents: List[Record] = Field(default=[]) + + def append(self, r: Record): + self.documents.append(r) diff --git a/requirements.txt b/requirements.txt index d303ea34..4d42e467 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,11 +32,7 @@ pytest-cov==5.0.0 pydantic==2.8.2 html2text==2024.2.26 giteapy==1.0.8 -# Uncomment the following lines to enable the 'similar issue' tool -# pinecone-client -# pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main -# lancedb==0.5.1 -# qdrant-client==1.15.1 +rapidfuzz>=3.0.0 # For fuzzy text matching in similar_issue tool # uncomment this to support language LangChainOpenAIHandler # langchain==0.2.0 # langchain-core==0.2.28