From 4b05a3e85845af93365602e6b80455f6d51088d1 Mon Sep 17 00:00:00 2001
From: mrT23 <tal.r@codium.ai>
Date: Mon, 7 Oct 2024 20:32:11 +0300
Subject: [PATCH] refactor: streamline hunk processing logic in
 git_patch_processing.py

- Simplified logic for handling new and old hunks to ensure consistent presentation of changes.
- Updated documentation in TOML files to reflect changes in hunk section handling and line number references.
---
 pr_agent/algo/git_patch_processing.py         | 36 ++++++++++---------
 .../settings/pr_code_suggestions_prompts.toml | 11 +++---
 .../pr_code_suggestions_reflect_prompts.toml  |  8 +++++
 pr_agent/settings/pr_reviewer_prompts.toml    |  9 +++--
 4 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py
index 180d7489e..f0540ad0a 100644
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@@ -281,7 +281,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
     prev_header_line = []
     header_line = []
     for line_i, line in enumerate(patch_lines):
-        if 'no newline at end of file' in line.lower().strip().strip('//'):
+        if 'no newline at end of file' in line.lower():
             continue
 
         if line.startswith('@@'):
@@ -290,18 +290,19 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
             if match and (new_content_lines or old_content_lines):  # found a new hunk, split the previous lines
                 if prev_header_line:
                     patch_with_lines_str += f'\n{prev_header_line}\n'
+                is_plus_lines = is_minus_lines = False
                 if new_content_lines:
                     is_plus_lines = any([line.startswith('+') for line in new_content_lines])
-                    if is_plus_lines:
-                        patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
-                        for i, line_new in enumerate(new_content_lines):
-                            patch_with_lines_str += f"{start2 + i} {line_new}\n"
                 if old_content_lines:
                     is_minus_lines = any([line.startswith('-') for line in old_content_lines])
-                    if is_minus_lines:
-                        patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
-                        for line_old in old_content_lines:
-                            patch_with_lines_str += f"{line_old}\n"
+                if is_plus_lines or is_minus_lines: # notice 'True' here - we always present __new hunk__ for section, otherwise LLM gets confused
+                    patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
+                    for i, line_new in enumerate(new_content_lines):
+                        patch_with_lines_str += f"{start2 + i} {line_new}\n"
+                if is_minus_lines:
+                    patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
+                    for line_old in old_content_lines:
+                        patch_with_lines_str += f"{line_old}\n"
                 new_content_lines = []
                 old_content_lines = []
             if match:
@@ -325,18 +326,19 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
     # finishing last hunk
     if match and new_content_lines:
         patch_with_lines_str += f'\n{header_line}\n'
+        is_plus_lines = is_minus_lines = False
         if new_content_lines:
             is_plus_lines = any([line.startswith('+') for line in new_content_lines])
-            if is_plus_lines:
-                patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
-                for i, line_new in enumerate(new_content_lines):
-                    patch_with_lines_str += f"{start2 + i} {line_new}\n"
         if old_content_lines:
             is_minus_lines = any([line.startswith('-') for line in old_content_lines])
-            if is_minus_lines:
-                patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
-                for line_old in old_content_lines:
-                    patch_with_lines_str += f"{line_old}\n"
+        if is_plus_lines or is_minus_lines:  # notice 'True' here - we always present __new hunk__ for section, otherwise LLM gets confused
+            patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__new hunk__\n'
+            for i, line_new in enumerate(new_content_lines):
+                patch_with_lines_str += f"{start2 + i} {line_new}\n"
+        if is_minus_lines:
+            patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
+            for line_old in old_content_lines:
+                patch_with_lines_str += f"{line_old}\n"
 
     return patch_with_lines_str.rstrip()
 
diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml
index 40e0ef577..5bf15f595 100644
--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -26,16 +26,15 @@ __old hunk__
 
 @@ ... @@ def func2():
 __new hunk__
-...
-__old hunk__
-...
-
+ unchanged code line4
++new code line5 removed in the PR
+ unchanged code line6
 
 ## File: 'src/file2.py'
 ...
 ======
 
-- In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was added or removed in a specific chunk, the corresponding section will be omitted.
+- In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was removed in a specific chunk, the __old hunk__ section will be omitted.
 - Line numbers were added for the '__new hunk__' sections to help referencing specific lines in the code suggestions. These numbers are for reference only and are not part of the actual code.
 - Code lines are prefixed with symbols: '+' for new code added in the PR, '-' for code removed, and ' ' for unchanged code.
 {%- if is_ai_metadata %}
@@ -49,7 +48,7 @@ Specific guidelines for generating code suggestions:
 - Prioritize suggestions that address potential issues, critical problems, and bugs in the PR code. Avoid repeating changes already implemented in the PR. If no pertinent suggestions are applicable, return an empty list.
 - Don't suggest to add docstring, type hints, or comments, to remove unused imports, or to use more specific exception types.
 - When referencing variables or names from the code, enclose them in backticks (`). Example: "ensure that `variable_name` is..."
-- Be mindful you are viewing a partial PR code diff, not the full codebase. Avoid suggestions that might conflict with unseen code or alerting on variables not declared in the visible scope, as the context is incomplete.
+- Be mindful you are viewing a partial PR code diff, not the full codebase. Avoid suggestions that might conflict with unseen code or alerting variables not declared in the visible scope, as the context is incomplete.
 
 
 {%- if extra_instructions %}
diff --git a/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml b/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml
index e029269b9..512ec592d 100644
--- a/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_reflect_prompts.toml
@@ -29,6 +29,14 @@ Key guidelines for evaluation:
   - Avoid inflating scores for suggestions that, while correct, offer only marginal improvements or optimizations.
 - Maintain the original order of suggestions in your feedback, corresponding to their input sequence.
 
+Additional scoring considerations:
+- If the suggestion is not actionable, and only asks the user to verify or ensure a change, reduce its score by 1-2 points.
+- Assign a score of 0 to suggestions aiming at:
+   - Adding docstring, type hints, or comments
+   - Remove unused imports or variables
+   - Using more specific exception types.
+
+
 
 The PR code diff will be presented in the following structured format:
 ======
diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml
index 56e93ce5b..e3b4bfe48 100644
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@@ -32,16 +32,15 @@ __old hunk__
 
 @@ ... @@ def func2():
 __new hunk__
-...
-__old hunk__
-...
-
+ unchanged code line4
++new code line5 removed in the PR
+ unchanged code line6
 
 ## File: 'src/file2.py'
 ...
 ======
 
-- In this format, we separated each hunk of diff code to '__new hunk__' and '__old hunk__' sections. The '__new hunk__' section contains the new code of the chunk, and the '__old hunk__' section contains the old code, that was removed. If no new code was added in a specific hunk, '__new hunk__' section will not be presented. If no code was removed, '__old hunk__' section will not be presented.
+- In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was removed in a specific chunk, the __old hunk__ section will be omitted.
 - We also added line numbers for the '__new hunk__' code, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and should only used for reference.
 - Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \
  The review should address new code added in the PR code diff (lines starting with '+')