Tr/updates23 (#2008)

* fix: improve PR description field guidance for clarity

* feat: refine suggestion guidelines to avoid redundant recommendations in PR reviews

* feat: enhance YAML parsing logic with additional keys and fallback strategies

* fix: update expected output format in YAML parsing test case
This commit is contained in:
Tal 2025-08-22 10:16:08 +03:00 committed by GitHub
parent 5fc466bfbc
commit dae9683770
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 49 additions and 22 deletions

View file

@ -772,7 +772,8 @@ def try_fix_yaml(response_text: str,
response_text_original="") -> dict: response_text_original="") -> dict:
response_text_lines = response_text.split('\n') response_text_lines = response_text.split('\n')
keys_yaml = ['relevant line:', 'suggestion content:', 'relevant file:', 'existing code:', 'improved code:', 'label:'] keys_yaml = ['relevant line:', 'suggestion content:', 'relevant file:', 'existing code:',
'improved code:', 'label:', 'why:', 'suggestion_summary:']
keys_yaml = keys_yaml + keys_fix_yaml keys_yaml = keys_yaml + keys_fix_yaml
# first fallback - try to convert 'relevant line: ...' to relevant line: |-\n ...' # first fallback - try to convert 'relevant line: ...' to relevant line: |-\n ...'
@ -847,6 +848,7 @@ def try_fix_yaml(response_text: str,
if index_end == -1: if index_end == -1:
index_end = len(response_text) index_end = len(response_text)
response_text_copy = response_text[index_start:index_end].strip().strip('```yaml').strip('`').strip() response_text_copy = response_text[index_start:index_end].strip().strip('```yaml').strip('`').strip()
if response_text_copy:
try: try:
data = yaml.safe_load(response_text_copy) data = yaml.safe_load(response_text_copy)
get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet") get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet")
@ -881,14 +883,18 @@ def try_fix_yaml(response_text: str,
response_text_copy = copy.deepcopy(response_text) response_text_copy = copy.deepcopy(response_text)
response_text_copy_lines = response_text_copy.split('\n') response_text_copy_lines = response_text_copy.split('\n')
start_line = -1 start_line = -1
improve_sections = ['existing_code:', 'improved_code:', 'response:', 'why:']
describe_sections = ['description:', 'title:', 'changes_diagram:', 'pr_files:', 'pr_ticket:']
for i, line in enumerate(response_text_copy_lines): for i, line in enumerate(response_text_copy_lines):
if 'existing_code:' in line or 'improved_code:' in line: line_stripped = line.rstrip()
if any(key in line_stripped for key in (improve_sections+describe_sections)):
start_line = i start_line = i
elif line.endswith(': |') or line.endswith(': |-') or line.endswith(': |2') or line.endswith(':'): elif line_stripped.endswith(': |') or line_stripped.endswith(': |-') or line_stripped.endswith(': |2') or any(line_stripped.endswith(key) for key in keys_yaml):
start_line = -1 start_line = -1
elif start_line != -1: elif start_line != -1:
response_text_copy_lines[i] = ' ' + line response_text_copy_lines[i] = ' ' + line
response_text_copy = '\n'.join(response_text_copy_lines) response_text_copy = '\n'.join(response_text_copy_lines)
response_text_copy = response_text_copy.replace(' |\n', ' |2\n')
try: try:
data = yaml.safe_load(response_text_copy) data = yaml.safe_load(response_text_copy)
get_logger().info(f"Successfully parsed AI prediction after adding indent for sections of code blocks") get_logger().info(f"Successfully parsed AI prediction after adding indent for sections of code blocks")
@ -896,6 +902,27 @@ def try_fix_yaml(response_text: str,
except: except:
pass pass
# eighth fallback - try to remove pipe chars at the root-level dicts
response_text_copy = copy.deepcopy(response_text)
response_text_copy = response_text_copy.lstrip('|\n')
try:
data = yaml.safe_load(response_text_copy)
get_logger().info(f"Successfully parsed AI prediction after removing pipe chars")
return data
except:
pass
# ninth fallback - try to decode the response text with different encodings. GPT-5 can return text that is not utf-8 encoded.
encodings_to_try = ['latin-1', 'utf-16']
for encoding in encodings_to_try:
try:
data = yaml.safe_load(response_text.encode(encoding).decode("utf-8"))
if data:
get_logger().info(f"Successfully parsed AI prediction after decoding with {encoding} encoding")
return data
except:
pass
# # sixth fallback - try to remove last lines # # sixth fallback - try to remove last lines
# for i in range(1, len(response_text_lines)): # for i in range(1, len(response_text_lines)):
# response_text_lines_tmp = '\n'.join(response_text_lines[:-i]) # response_text_lines_tmp = '\n'.join(response_text_lines[:-i])

View file

@ -63,10 +63,15 @@ Specific guidelines for generating code suggestions:
- Don't suggest to add docstring, type hints, or comments, to remove unused imports, or to use more specific exception types. - Don't suggest to add docstring, type hints, or comments, to remove unused imports, or to use more specific exception types.
{%- else %} {%- else %}
- Only give suggestions that address critical problems and bugs in the PR code. If no relevant suggestions are applicable, return an empty list. - Only give suggestions that address critical problems and bugs in the PR code. If no relevant suggestions are applicable, return an empty list.
- Do not suggest to change packages version, add missing import statement, or declare undefined variable. - DO NOT suggest the following:
- change packages version
- add missing import statement
- declare undefined variable, or remove unused variable
- use more specific exception types
- repeat changes already done in the PR code
{%- endif %} {%- endif %}
- Be aware that your input consists only of partial code segments (PR diff code), not the complete codebase. Therefore, avoid making suggestions that might duplicate existing functionality, and refrain from questioning code elements (such as variable declarations or import statements) that may be defined elsewhere in the codebase.
- When mentioning code elements (variables, names, or files) in your response, surround them with backticks (`). For example: "verify that `user_id` is..." - When mentioning code elements (variables, names, or files) in your response, surround them with backticks (`). For example: "verify that `user_id` is..."
- Note that you only see changed code segments (diff hunks in a PR), not the entire codebase. Avoid suggestions that might duplicate existing functionality or questioning code elements (like variables declarations or import statements) that may be defined elsewhere in the codebase.
{%- if extra_instructions %} {%- if extra_instructions %}

View file

@ -45,7 +45,7 @@ class FileDescription(BaseModel):
class PRDescription(BaseModel): class PRDescription(BaseModel):
type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')") type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 'Bug fix', not 'bug_fix')")
description: str = Field(description="summarize the PR changes in up to four bullet points, each up to 8 words. For large PRs, add sub-bullets if needed. Order bullets by importance, with each bullet highlighting a key change group.") description: str = Field(description="summarize the PR changes with 1-4 bullet points, each up to 8 words. For large PRs, add sub-bullets for each bullet if needed. Order bullets by importance, with each bullet highlighting a key change group.")
title: str = Field(description="a concise and descriptive title that captures the PR's main theme") title: str = Field(description="a concise and descriptive title that captures the PR's main theme")
{%- if enable_pr_diagram %} {%- if enable_pr_diagram %}
changes_diagram: str = Field(description='a horizontal diagram that represents the main PR changes, in the format of a valid mermaid LR flowchart. The diagram should be concise and easy to read. Leave empty if no diagram is relevant. To create robust Mermaid diagrams, follow this two-step process: (1) Declare the nodes: nodeID["node description"]. (2) Then define the links: nodeID1 -- "link text" --> nodeID2. Node description must always be surrounded with double quotation marks') changes_diagram: str = Field(description='a horizontal diagram that represents the main PR changes, in the format of a valid mermaid LR flowchart. The diagram should be concise and easy to read. Leave empty if no diagram is relevant. To create robust Mermaid diagrams, follow this two-step process: (1) Declare the nodes: nodeID["node description"]. (2) Then define the links: nodeID1 -- "link text" --> nodeID2. Node description must always be surrounded with double quotation marks')
@ -63,7 +63,8 @@ type:
- ... - ...
- ... - ...
description: | description: |
... - ...
- ...
title: | title: |
... ...
{%- if enable_pr_diagram %} {%- if enable_pr_diagram %}
@ -151,7 +152,8 @@ type:
- Refactoring - Refactoring
- ... - ...
description: | description: |
... - ...
- ...
title: | title: |
... ...
{%- if enable_pr_diagram %} {%- if enable_pr_diagram %}

View file

@ -242,12 +242,5 @@ int sub(int a, int b) {
return a - b; return a - b;
} }
''' '''
expected_output = { expected_output = {'code_suggestions': [{'relevant_file': 'a.c\n', 'existing_code': ' int sum(int a, int b) {\n return a + b;\n }\n\n int sub(int a, int b) {\n return a - b;\n }\n'}]}
"code_suggestions": [
{
"relevant_file": "a.c\n",
"existing_code": expected_code_block
}
]
}
assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='existing_code') == expected_output assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='existing_code') == expected_output