Skip to content

Commit

Permalink
Merge pull request #1091 from Codium-ai/tr/patch_improvements
Browse files Browse the repository at this point in the history
patch and prompt improvements
  • Loading branch information
mrT23 authored Aug 4, 2024
2 parents 85cc0ad + ee1676c commit 2b77d07
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 40 deletions.
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10 AS base
FROM python:3.12.3 AS base

WORKDIR /app
ADD pyproject.toml .
Expand Down
45 changes: 26 additions & 19 deletions pr_agent/algo/git_patch_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
line6
...
"""

patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
patch_lines = patch.splitlines()
RE_HUNK_HEADER = re.compile(
Expand All @@ -193,25 +192,29 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
match = None
start1, size1, start2, size2 = -1, -1, -1, -1
prev_header_line = []
header_line =[]
header_line = []
for line in patch_lines:
if 'no newline at end of file' in line.lower():
continue

if line.startswith('@@'):
header_line = line
match = RE_HUNK_HEADER.match(line)
if match and new_content_lines: # found a new hunk, split the previous lines
if match and (new_content_lines or old_content_lines): # found a new hunk, split the previous lines
if prev_header_line:
patch_with_lines_str += f'\n{prev_header_line}\n'
if new_content_lines:
if prev_header_line:
patch_with_lines_str += f'\n{prev_header_line}\n'
patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
is_plus_lines = any([line.startswith('+') for line in new_content_lines])
if is_plus_lines:
patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
for line_old in old_content_lines:
patch_with_lines_str += f"{line_old}\n"
is_minus_lines = any([line.startswith('-') for line in old_content_lines])
if is_minus_lines:
patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
for line_old in old_content_lines:
patch_with_lines_str += f"{line_old}\n"
new_content_lines = []
old_content_lines = []
if match:
Expand All @@ -223,7 +226,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
res[i] = 0
try:
start1, size1, start2, size2 = map(int, res[:4])
except: # '@@ -0,0 +1 @@' case
except: # '@@ -0,0 +1 @@' case
start1, size1, size2 = map(int, res[:3])
start2 = 0

Expand All @@ -237,15 +240,19 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:

# finishing last hunk
if match and new_content_lines:
patch_with_lines_str += f'\n{header_line}\n'
if new_content_lines:
patch_with_lines_str += f'\n{header_line}\n'
patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
is_plus_lines = any([line.startswith('+') for line in new_content_lines])
if is_plus_lines:
patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
for line_old in old_content_lines:
patch_with_lines_str += f"{line_old}\n"
is_minus_lines = any([line.startswith('-') for line in old_content_lines])
if is_minus_lines:
patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
for line_old in old_content_lines:
patch_with_lines_str += f"{line_old}\n"

return patch_with_lines_str.rstrip()

Expand Down
27 changes: 13 additions & 14 deletions pr_agent/settings/pr_code_suggestions_prompts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ The format we will use to present the PR code diff:
@@ ... @@ def func1():
__new hunk__
12 code line1 that remained unchanged in the PR
13 +new hunk code line2 added in the PR
13 +new code line2 added in the PR
14 code line3 that remained unchanged in the PR
__old hunk__
code line1 that remained unchanged in the PR
-old hunk code line2 that was removed in the PR
-old code line2 that was removed in the PR
code line3 that remained unchanged in the PR
@@ ... @@ def func2():
Expand All @@ -27,7 +27,7 @@ __old hunk__
## file: 'src/file2.py'
...
======
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
- Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
Suggestions should always focus on ways to improve the new code lines introduced in the PR, meaning lines in the '__new hunk__' sections that begin with a '+' symbol (after the line numbers). The '__old hunk__' sections code is for context and reference only.
Expand Down Expand Up @@ -57,10 +57,10 @@ Extra instructions from the user, that should be taken into account with high pr
The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions:
=====
class CodeSuggestion(BaseModel):
relevant_file: str = Field(description="The full file path of the relevant file.")
language: str = Field(description="The programming language of the relevant file.")
relevant_file: str = Field(description="The full file path of the relevant file")
language: str = Field(description="The programming language of the relevant file")
suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR")
existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Use abbreviations if needed")
existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Quote only full code lines, not partial ones. Use abbreviations ("...") of full lines if needed")
improved_code: str = Field(description="a new code snippet, that can be used to replace the relevant 'existing_code' lines in '__new hunk__' code after applying the suggestion")
one_sentence_summary: str = Field(description="a short summary of the suggestion action, in a single sentence. Focus on the 'what'. Be general, and avoid method or variable names.")
relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above")
Expand Down Expand Up @@ -125,11 +125,11 @@ The format we will use to present the PR code diff:
@@ ... @@ def func1():
__new hunk__
12 code line1 that remained unchanged in the PR
13 +new hunk code line2 added in the PR
13 +new code line2 added in the PR
14 code line3 that remained unchanged in the PR
__old hunk__
code line1 that remained unchanged in the PR
-old hunk code line2 that was removed in the PR
-old code line2 that was removed in the PR
code line3 that remained unchanged in the PR
@@ ... @@ def func2():
Expand All @@ -142,12 +142,11 @@ __old hunk__
## file: 'src/file2.py'
...
======
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
- Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
Suggestions should always focus on ways to improve the new code lines introduced in the PR, meaning lines in the '__new hunk__' sections that begin with a '+' symbol (after the line numbers). The '__old hunk__' sections code is for context and reference only.
Specific instructions for generating code suggestions:
- Provide in total up to {{ num_code_suggestions }} code suggestions. The suggestions should be diverse and insightful.
- The suggestions should focus on improving the new code introduced the PR, meaning lines from '__new hunk__' sections, starting with '+' (after the line numbers).
Expand All @@ -171,10 +170,10 @@ Extra instructions from the user, that should be taken into account with high pr
The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions:
=====
class CodeSuggestion(BaseModel):
relevant_file: str = Field(description="The full file path of the relevant file.")
language: str = Field(description="The programming language of the relevant file.")
relevant_file: str = Field(description="The full file path of the relevant file")
language: str = Field(description="the programming language of the relevant file")
suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise ")
existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Use abbreviations ("...") if needed")
existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Quote only full code lines, not partial ones. Use abbreviations ("...") of full lines if needed")
improved_code: str = Field(description="a new code snippet, that can be used to replace the relevant 'existing_code' lines in '__new hunk__' code after applying the suggestion")
one_sentence_summary: str = Field(description="a short summary of the suggestion action, in a single sentence. Focus on the 'what'. Be general, and avoid method or variable names.")
relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above")
Expand Down Expand Up @@ -210,4 +209,4 @@ code_suggestions:
Each YAML output MUST be after a newline, indented, with block scalar indicator ('|').
"""
"""
6 changes: 3 additions & 3 deletions pr_agent/settings/pr_code_suggestions_reflect_prompts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ The format that is used to present the PR code diff is as follows:
@@ ... @@ def func1():
__new hunk__
12 code line1 that remained unchanged in the PR
13 +new hunk code line2 added in the PR
13 +new code line2 added in the PR
14 code line3 that remained unchanged in the PR
__old hunk__
code line1 that remained unchanged in the PR
-old hunk code line2 that was removed in the PR
-old code line2 that was removed in the PR
code line3 that remained unchanged in the PR
@@ ... @@ def func2():
Expand All @@ -39,11 +39,11 @@ __old hunk__
...
======
- In this format, we separated each hunk of code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code that was removed.
- If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
- Code lines are prefixed symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code.
The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions:
=====
class CodeSuggestionFeedback(BaseModel):
Expand Down
6 changes: 3 additions & 3 deletions pr_agent/settings/pr_reviewer_prompts.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ The format we will use to present the PR code diff:
@@ ... @@ def func1():
__new hunk__
12 code line1 that remained unchanged in the PR
13 +new hunk code line2 added in the PR
13 +new code line2 added in the PR
14 code line3 that remained unchanged in the PR
__old hunk__
code line1 that remained unchanged in the PR
-old hunk code line2 that was removed in the PR
-old code line2 that was removed in the PR
code line3 that remained unchanged in the PR
@@ ... @@ def func2():
Expand All @@ -32,7 +32,7 @@ __old hunk__
## file: 'src/file2.py'
...
======
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
- We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
- Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
The review should focus on new code added in the PR diff (lines starting with '+')
Expand Down

0 comments on commit 2b77d07

Please sign in to comment.