Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding numbered hunks and code suggestions feature #50

Merged
merged 1 commit into from
Jul 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions pr_agent/algo/git_patch_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,78 @@ def handle_patch_deletions(patch: str, original_file_content_str: str,
logging.info(f"Processing file: {file_name}, hunks were deleted")
patch = patch_new
return patch


def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The convert_to_hunks_with_lines_numbers function could be optimized. Currently, it iterates over the patch lines multiple times, which could be inefficient for large patches. Consider refactoring the function to reduce the number of iterations. [medium]

# toDO: (maybe remove '-' and '+' from the beginning of the line)
"""
## src/file.ts
--new hunk--
881 line1
882 line2
883 line3
884 line4
885 line6
886 line7
887 + line8
888 + line9
889 line10
890 line11
...
--old hunk--
line1
line2
- line3
- line4
line5
line6
...

"""
patch_with_lines_str = f"## {file.filename}\n"
import re
patch_lines = patch.splitlines()
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
new_content_lines = []
old_content_lines = []
match = None
start1, size1, start2, size2 = -1, -1, -1, -1
for line in patch_lines:
if 'no newline at end of file' in line.lower():
continue

if line.startswith('@@'):
match = RE_HUNK_HEADER.match(line)
if match and new_content_lines: # found a new hunk, split the previous lines
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
new_content_lines = []
old_content_lines = []
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif line.startswith('+'):
new_content_lines.append(line)
elif line.startswith('-'):
old_content_lines.append(line)
else:
new_content_lines.append(line)
mrT23 marked this conversation as resolved.
Show resolved Hide resolved
old_content_lines.append(line)

# finishing last hunk
if match and new_content_lines:
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '\n--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"

Comment on lines +113 to +184
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: The convert_to_hunks_with_lines_numbers function is quite long and does multiple things. Consider breaking it down into smaller, more manageable functions. This will make the code easier to read and maintain.

Suggested change
def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
# toDO: (maybe remove '-' and '+' from the beginning of the line)
"""
## src/file.ts
--new hunk--
881 line1
882 line2
883 line3
884 line4
885 line6
886 line7
887 + line8
888 + line9
889 line10
890 line11
...
--old hunk--
line1
line2
- line3
- line4
line5
line6
...
"""
patch_with_lines_str = f"## {file.filename}\n"
import re
patch_lines = patch.splitlines()
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
new_content_lines = []
old_content_lines = []
match = None
start1, size1, start2, size2 = -1, -1, -1, -1
for line in patch_lines:
if 'no newline at end of file' in line.lower():
continue
if line.startswith('@@'):
match = RE_HUNK_HEADER.match(line)
if match and new_content_lines: # found a new hunk, split the previous lines
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
new_content_lines = []
old_content_lines = []
start1, size1, start2, size2 = map(int, match.groups()[:4])
elif line.startswith('+'):
new_content_lines.append(line)
elif line.startswith('-'):
old_content_lines.append(line)
else:
new_content_lines.append(line)
old_content_lines.append(line)
# finishing last hunk
if match and new_content_lines:
if new_content_lines:
patch_with_lines_str += '\n--new hunk--\n'
for i, line_new in enumerate(new_content_lines):
patch_with_lines_str += f"{start2 + i} {line_new}\n"
if old_content_lines:
patch_with_lines_str += '\n--old hunk--\n'
for i, line_old in enumerate(old_content_lines):
patch_with_lines_str += f"{line_old}\n"
def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
# toDO: (maybe remove '-' and '+' from the beginning of the line)
...
patch_with_lines_str = process_patch_lines(patch_lines)
return patch_with_lines_str.strip()
def process_patch_lines(patch_lines):
...
return patch_with_lines_str

return patch_with_lines_str.strip()
36 changes: 28 additions & 8 deletions pr_agent/algo/pr_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import logging
from typing import Any, Tuple, Union

from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions
from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions, \
convert_to_hunks_with_lines_numbers
from pr_agent.algo.language_handler import sort_files_by_main_languages
from pr_agent.algo.token_handler import TokenHandler
from pr_agent.config_loader import settings
Expand All @@ -19,26 +20,33 @@
PATCH_EXTRA_LINES = 3


def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str:
def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider refactoring the get_pr_diff function to avoid the use of global variables. Instead of modifying the global PATCH_EXTRA_LINES variable, you could pass it as a parameter to the functions that need it. This would make the code more modular and easier to test. [important]

add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool =False) -> str:
Comment on lines +23 to +24
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Consider using a class or a data structure to encapsulate the parameters of the get_pr_diff function. This will make the function signature cleaner and easier to manage as the number of parameters grows.

Suggested change
def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler,
add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool =False) -> str:
class PrDiffParams:
def __init__(self, git_provider, token_handler, add_line_numbers_to_hunks=False, disable_extra_lines=False):
self.git_provider = git_provider
self.token_handler = token_handler
self.add_line_numbers_to_hunks = add_line_numbers_to_hunks
self.disable_extra_lines = disable_extra_lines
def get_pr_diff(params: PrDiffParams) -> str:

"""
Returns a string with the diff of the PR.
If needed, apply diff minimization techniques to reduce the number of tokens
"""
if disable_extra_lines:
global PATCH_EXTRA_LINES
PATCH_EXTRA_LINES = 0

git_provider.pr.diff_files = list(git_provider.get_diff_files())

# get pr languages
pr_languages = sort_files_by_main_languages(git_provider.get_languages(), git_provider.pr.diff_files)

# generate a standard diff string, with patch extension
patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler)
patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler,
add_line_numbers_to_hunks)
mrT23 marked this conversation as resolved.
Show resolved Hide resolved
mrT23 marked this conversation as resolved.
Show resolved Hide resolved

# if we are under the limit, return the full diff
if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < token_handler.limit:
return "\n".join(patches_extended)

# if we are over the limit, start pruning
patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages,
token_handler)
patches_compressed, modified_file_names, deleted_file_names = \
pr_generate_compressed_diff(pr_languages, token_handler, add_line_numbers_to_hunks)

final_diff = "\n".join(patches_compressed)
if modified_file_names:
modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
Expand All @@ -49,7 +57,8 @@ def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHa
return final_diff


def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \
def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler,
add_line_numbers_to_hunks: bool) -> \
Tuple[list, int]:
"""
Generate a standard diff string, with patch extension
Expand All @@ -72,6 +81,9 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n"

if add_line_numbers_to_hunks:
full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file)

patch_tokens = token_handler.count_tokens(full_extended_patch)
file.tokens = patch_tokens
total_tokens += patch_tokens
Expand All @@ -80,7 +92,8 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
return patches_extended, total_tokens

mrT23 marked this conversation as resolved.
Show resolved Hide resolved

def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]:
def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler,
convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]:
# Apply Diff Minimization techniques to reduce the number of tokens:
# 0. Start from the largest diff patch to smaller ones
# 1. Don't use extend context lines around diff
Expand Down Expand Up @@ -114,6 +127,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
deleted_files_list.append(file.filename)
total_tokens += token_handler.count_tokens(file.filename) + 1
continue

if convert_hunks_to_line_numbers:
patch = convert_to_hunks_with_lines_numbers(patch, file)

new_patch_tokens = token_handler.count_tokens(patch)

# Hard Stop, no more tokens
Expand All @@ -135,7 +152,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
continue

if patch:
patch_final = f"## {file.filename}\n\n{patch}\n"
if not convert_hunks_to_line_numbers:
patch_final = f"## {file.filename}\n\n{patch}\n"
else:
patch_final = patch
patches.append(patch_final)
total_tokens += token_handler.count_tokens(patch_final)
if settings.config.verbosity_level >= 2:
Expand Down
8 changes: 7 additions & 1 deletion pr_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os

from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
from pr_agent.tools.pr_description import PRDescription
from pr_agent.tools.pr_questions import PRQuestions
from pr_agent.tools.pr_reviewer import PRReviewer
Expand All @@ -12,7 +13,8 @@ def run():
parser = argparse.ArgumentParser(description='AI based pull request analyzer')
parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True)
parser.add_argument('--question', type=str, help='Optional question to ask', required=False)
parser.add_argument('--pr_description', action='store_true', help='Optional question to ask', required=False)
parser.add_argument('--pr_description', action='store_true', required=False)
parser.add_argument('--pr_code_suggestions', action='store_true', required=False)
Comment on lines +16 to +17
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Consider using subcommands instead of flags for the different modes of operation (pr_description, pr_code_suggestions). This will make the command line interface more intuitive and easier to use.

Suggested change
parser.add_argument('--pr_description', action='store_true', required=False)
parser.add_argument('--pr_code_suggestions', action='store_true', required=False)
subparsers = parser.add_subparsers(dest='command')
pr_description_parser = subparsers.add_parser('pr_description')
pr_code_suggestions_parser = subparsers.add_parser('pr_code_suggestions')

args = parser.parse_args()
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
if args.question:
Expand All @@ -23,6 +25,10 @@ def run():
print(f"PR description: {args.pr_url}")
reviewer = PRDescription(args.pr_url)
asyncio.run(reviewer.describe())
elif args.pr_code_suggestions:
print(f"PR code suggestions: {args.pr_url}")
reviewer = PRCodeSuggestions(args.pr_url)
asyncio.run(reviewer.suggest())
else:
print(f"Reviewing PR: {args.pr_url}")
reviewer = PRReviewer(args.pr_url, cli_mode=True)
mrT23 marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
1 change: 1 addition & 0 deletions pr_agent/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"settings/pr_reviewer_prompts.toml",
"settings/pr_questions_prompts.toml",
"settings/pr_description_prompts.toml",
"settings/pr_code_suggestions_prompts.toml",
"settings_prod/.secrets.toml"
]]
)
5 changes: 5 additions & 0 deletions pr_agent/git_providers/git_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def publish_comment(self, pr_comment: str, is_temporary: bool = False):
def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
pass

@abstractmethod
def publish_code_suggestion(self, body: str, relevant_file: str,
relevant_lines_start: int, relevant_lines_end: int):
pass

@abstractmethod
def remove_initial_comment(self):
pass
Expand Down
51 changes: 49 additions & 2 deletions pr_agent/git_providers/github_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@

from pr_agent.config_loader import settings

from .git_provider import FilePatchInfo
from .git_provider import FilePatchInfo, GitProvider


class GithubProvider:
class GithubProvider(GitProvider):
def __init__(self, pr_url: Optional[str] = None):
self.installation_id = settings.get("GITHUB.INSTALLATION_ID")
self.github_client = self._get_github_client()
Expand Down Expand Up @@ -76,6 +76,53 @@ def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in
path = relevant_file.strip()
self.pr.create_review_comment(body=body, commit_id=self.last_commit_id, path=path, position=position)

def publish_code_suggestion(self, body: str,
relevant_file: str,
relevant_lines_start: int,
relevant_lines_end: int):
if not relevant_lines_start or relevant_lines_start == -1:
if settings.config.verbosity_level >= 2:
logging.exception(f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}")
return False

if relevant_lines_end<relevant_lines_start:
if settings.config.verbosity_level >= 2:
logging.exception(f"Failed to publish code suggestion, "
f"relevant_lines_end is {relevant_lines_end} and "
f"relevant_lines_start is {relevant_lines_start}")
return False

try:
import github.PullRequestComment
if relevant_lines_end > relevant_lines_start:
post_parameters = {
"body": body,
"commit_id": self.last_commit_id._identity,
"path": relevant_file,
"line": relevant_lines_end,
"start_line": relevant_lines_start,
"start_side": "RIGHT",
}
else: # API is different for single line comments
post_parameters = {
"body": body,
"commit_id": self.last_commit_id._identity,
"path": relevant_file,
"line": relevant_lines_start,
"side": "RIGHT",
}
headers, data = self.pr._requester.requestJsonAndCheck(
"POST", f"{self.pr.url}/comments", input=post_parameters
)
github.PullRequestComment.PullRequestComment(
self.pr._requester, headers, data, completed=True
)
return True
except Exception as e:
if settings.config.verbosity_level >= 2:
logging.error(f"Failed to publish code suggestion, error: {e}")
return False

def remove_initial_comment(self):
try:
for comment in self.pr.comments_list:
Expand Down
6 changes: 6 additions & 0 deletions pr_agent/git_providers/gitlab_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in
self.mr.discussions.create({'body': body,
'position': pos_obj})

def publish_code_suggestion(self, body: str,
relevant_file: str,
relevant_lines_start: int,
relevant_lines_end: int):
raise "not implemented yet for gitlab"

def search_line(self, relevant_file, relevant_line_in_file):
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
Expand Down
5 changes: 4 additions & 1 deletion pr_agent/settings/configuration.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ verbosity_level=0 # 0,1,2
require_focused_review=true
require_tests_review=true
require_security_review=true
num_code_suggestions=4
num_code_suggestions=3
inline_code_comments = true

[pr_questions]

[pr_code_suggestions]
num_code_suggestions=4

[github]
# The type of deployment to create. Valid values are 'app' or 'user'.
deployment_type = "user"
Expand Down
Loading