Merge pull request #1091 from Codium-ai/tr/patch_improvements

patch and prompt improvements
Codium-ai · Aug 4, 2024 · 2b77d07 · 2b77d07
2 parents 85cc0ad + ee1676c
commit 2b77d07
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 40 deletions.
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10 AS base
+FROM python:3.12.3 AS base
 
 WORKDIR /app
 ADD pyproject.toml .

diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py
@@ -183,7 +183,6 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
         line6
            ...
     """
-
     patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
     patch_lines = patch.splitlines()
     RE_HUNK_HEADER = re.compile(
@@ -193,25 +192,29 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
     match = None
     start1, size1, start2, size2 = -1, -1, -1, -1
     prev_header_line = []
-    header_line =[]
+    header_line = []
     for line in patch_lines:
         if 'no newline at end of file' in line.lower():
             continue
 
         if line.startswith('@@'):
             header_line = line
             match = RE_HUNK_HEADER.match(line)
-            if match and new_content_lines:  # found a new hunk, split the previous lines
+            if match and (new_content_lines or old_content_lines):  # found a new hunk, split the previous lines
+                if prev_header_line:
+                    patch_with_lines_str += f'\n{prev_header_line}\n'
                 if new_content_lines:
-                    if prev_header_line:
-                        patch_with_lines_str += f'\n{prev_header_line}\n'
-                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
-                    for i, line_new in enumerate(new_content_lines):
-                        patch_with_lines_str += f"{start2 + i} {line_new}\n"
+                    is_plus_lines = any([line.startswith('+') for line in new_content_lines])
+                    if is_plus_lines:
+                        patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
+                        for i, line_new in enumerate(new_content_lines):
+                            patch_with_lines_str += f"{start2 + i} {line_new}\n"
                 if old_content_lines:
-                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
-                    for line_old in old_content_lines:
-                        patch_with_lines_str += f"{line_old}\n"
+                    is_minus_lines = any([line.startswith('-') for line in old_content_lines])
+                    if is_minus_lines:
+                        patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
+                        for line_old in old_content_lines:
+                            patch_with_lines_str += f"{line_old}\n"
                 new_content_lines = []
                 old_content_lines = []
             if match:
@@ -223,7 +226,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
                     res[i] = 0
             try:
                 start1, size1, start2, size2 = map(int, res[:4])
-            except: # '@@ -0,0 +1 @@' case
+            except:  # '@@ -0,0 +1 @@' case
                 start1, size1, size2 = map(int, res[:3])
                 start2 = 0
 
@@ -237,15 +240,19 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
 
     # finishing last hunk
     if match and new_content_lines:
+        patch_with_lines_str += f'\n{header_line}\n'
         if new_content_lines:
-            patch_with_lines_str += f'\n{header_line}\n'
-            patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
-            for i, line_new in enumerate(new_content_lines):
-                patch_with_lines_str += f"{start2 + i} {line_new}\n"
+            is_plus_lines = any([line.startswith('+') for line in new_content_lines])
+            if is_plus_lines:
+                patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
+                for i, line_new in enumerate(new_content_lines):
+                    patch_with_lines_str += f"{start2 + i} {line_new}\n"
         if old_content_lines:
-            patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
-            for line_old in old_content_lines:
-                patch_with_lines_str += f"{line_old}\n"
+            is_minus_lines = any([line.startswith('-') for line in old_content_lines])
+            if is_minus_lines:
+                patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
+                for line_old in old_content_lines:
+                    patch_with_lines_str += f"{line_old}\n"
 
     return patch_with_lines_str.rstrip()
 

diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -10,11 +10,11 @@ The format we will use to present the PR code diff:
 @@ ... @@ def func1():
 __new hunk__
 12  code line1 that remained unchanged in the PR
-13 +new hunk code line2 added in the PR
+13 +new code line2 added in the PR
 14  code line3 that remained unchanged in the PR
 __old hunk__
  code line1 that remained unchanged in the PR
--old hunk code line2 that was removed in the PR
+-old code line2 that was removed in the PR
  code line3 that remained unchanged in the PR
 
 @@ ... @@ def func2():
@@ -27,7 +27,7 @@ __old hunk__
 ## file: 'src/file2.py'
 ...
 ======
-- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
+- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, the '__new hunk__' section will not be presented. If no code was removed, the '__old hunk__' section will not be presented.
 - We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
 Suggestions should always focus on ways to improve the new code lines introduced in the PR, meaning lines in the '__new hunk__' sections that begin with a '+' symbol (after the line numbers). The '__old hunk__' sections code is for context and reference only.
@@ -57,10 +57,10 @@ Extra instructions from the user, that should be taken into account with high pr
 The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions:
 =====
 class CodeSuggestion(BaseModel):
-    relevant_file: str = Field(description="The full file path of the relevant file.")
-    language: str = Field(description="The programming language of the relevant file.")
+    relevant_file: str = Field(description="The full file path of the relevant file")
+    language: str = Field(description="The programming language of the relevant file")
     suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR")
-    existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Use abbreviations if needed")
+    existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Quote only full code lines, not partial ones. Use abbreviations ("...") of full lines if needed")
     improved_code: str = Field(description="a new code snippet, that can be used to replace the relevant 'existing_code' lines in '__new hunk__' code after applying the suggestion")
     one_sentence_summary: str = Field(description="a short summary of the suggestion action, in a single sentence. Focus on the 'what'. Be general, and avoid method or variable names.")
     relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above")
@@ -125,11 +125,11 @@ The format we will use to present the PR code diff:
 @@ ... @@ def func1():
 __new hunk__
 12  code line1 that remained unchanged in the PR
-13 +new hunk code line2 added in the PR
+13 +new code line2 added in the PR
 14  code line3 that remained unchanged in the PR
 __old hunk__
  code line1 that remained unchanged in the PR
--old hunk code line2 that was removed in the PR
+-old code line2 that was removed in the PR
  code line3 that remained unchanged in the PR
 
 @@ ... @@ def func2():
@@ -142,12 +142,11 @@ __old hunk__
 ## file: 'src/file2.py'
 ...
 ======
-- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
+- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, the '__new hunk__' section will not be presented. If no code was removed, the '__old hunk__' section will not be presented.
 - We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
 Suggestions should always focus on ways to improve the new code lines introduced in the PR, meaning lines in the '__new hunk__' sections that begin with a '+' symbol (after the line numbers). The '__old hunk__' sections code is for context and reference only.
 
-
 Specific instructions for generating code suggestions:
 - Provide in total up to {{ num_code_suggestions }} code suggestions. The suggestions should be diverse and insightful.
 - The suggestions should focus on improving the new code introduced in the PR, meaning lines from '__new hunk__' sections, starting with '+' (after the line numbers).
@@ -171,10 +170,10 @@ Extra instructions from the user, that should be taken into account with high pr
 The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions:
 =====
 class CodeSuggestion(BaseModel):
-    relevant_file: str = Field(description="The full file path of the relevant file.")
-    language: str = Field(description="The programming language of the relevant file.")
+    relevant_file: str = Field(description="The full file path of the relevant file")
+    language: str = Field(description="The programming language of the relevant file")
     suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. Be short and concise ")
-    existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Use abbreviations ("...") if needed")
+    existing_code: str = Field(description="a short code snippet, demonstrating the relevant code lines from a '__new hunk__' section. It must be without line numbers. Quote only full code lines, not partial ones. Use abbreviations ("...") of full lines if needed")
     improved_code: str = Field(description="a new code snippet, that can be used to replace the relevant 'existing_code' lines in '__new hunk__' code after applying the suggestion")
     one_sentence_summary: str = Field(description="a short summary of the suggestion action, in a single sentence. Focus on the 'what'. Be general, and avoid method or variable names.")
     relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above")
@@ -210,4 +209,4 @@ code_suggestions:
 
 
 Each YAML output MUST be after a newline, indented, with block scalar indicator ('|').
-"""
+"""
diff --git a/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml b/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml
@@ -21,11 +21,11 @@ The format that is used to present the PR code diff is as follows:
 @@ ... @@ def func1():
 __new hunk__
 12  code line1 that remained unchanged in the PR
-13 +new hunk code line2 added in the PR
+13 +new code line2 added in the PR
 14  code line3 that remained unchanged in the PR
 __old hunk__
  code line1 that remained unchanged in the PR
--old hunk code line2 that was removed in the PR
+-old code line2 that was removed in the PR
  code line3 that remained unchanged in the PR
 
 @@ ... @@ def func2():
@@ -39,11 +39,11 @@ __old hunk__
 ...
 ======
 - In this format, we separated each hunk of code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code that was removed.
+- If no new code was added in a specific hunk, the '__new hunk__' section will not be presented. If no code was removed, the '__old hunk__' section will not be presented.
 - We added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code.
 
 
-
 The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions:
 =====
 class CodeSuggestionFeedback(BaseModel):

diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml
@@ -15,11 +15,11 @@ The format we will use to present the PR code diff:
 @@ ... @@ def func1():
 __new hunk__
 12  code line1 that remained unchanged in the PR
-13 +new hunk code line2 added in the PR
+13 +new code line2 added in the PR
 14  code line3 that remained unchanged in the PR
 __old hunk__
  code line1 that remained unchanged in the PR
--old hunk code line2 that was removed in the PR
+-old code line2 that was removed in the PR
  code line3 that remained unchanged in the PR
 
 @@ ... @@ def func2():
@@ -32,7 +32,7 @@ __old hunk__
 ## file: 'src/file2.py'
 ...
 ======
-- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed.
+- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, the '__new hunk__' section will not be presented. If no code was removed, the '__old hunk__' section will not be presented.
 - We also added line numbers for the '__new hunk__' sections, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and are only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
 The review should focus on new code added in the PR diff (lines starting with '+')