diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py
index 6c450a301..87761e1ba 100644
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@@ -108,3 +108,78 @@ def handle_patch_deletions(patch: str, original_file_content_str: str,
                 logging.info(f"Processing file: {file_name}, hunks were deleted")
             patch = patch_new
     return patch
+
+
+def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
+    # TODO: (maybe remove '-' and '+' from the beginning of the line)
+    """
+    ## src/file.ts
+--new hunk--
+881 line1
+882 line2
+883 line3
+884 line4
+885 line6
+886 line7
+887 + line8
+888 + line9
+889 line10
+890 line11
+...
+--old hunk--
+ line1
+ line2
+- line3
+- line4
+ line5
+ line6
+ ...
+
+    """
+    patch_with_lines_str = f"## {file.filename}\n"
+    import re
+    patch_lines = patch.splitlines()
+    RE_HUNK_HEADER = re.compile(
+        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
+    new_content_lines = []
+    old_content_lines = []
+    match = None
+    start1, size1, start2, size2 = -1, -1, -1, -1
+    for line in patch_lines:
+        if 'no newline at end of file' in line.lower():
+            continue
+
+        if line.startswith('@@'):
+            match = RE_HUNK_HEADER.match(line)
+            if match and new_content_lines:  # found a new hunk, split the previous lines
+                if new_content_lines:
+                    patch_with_lines_str += '\n--new hunk--\n'
+                    for i, line_new in enumerate(new_content_lines):
+                        patch_with_lines_str += f"{start2 + i} {line_new}\n"
+                if old_content_lines:
+                    patch_with_lines_str += '--old hunk--\n'
+                    for i, line_old in enumerate(old_content_lines):
+                        patch_with_lines_str += f"{line_old}\n"
+                new_content_lines = []
+                old_content_lines = []
+            start1, size1, start2, size2 = map(int, match.groups()[:4])
+        elif line.startswith('+'):
+            new_content_lines.append(line)
+        elif line.startswith('-'):
+            old_content_lines.append(line)
+        else:
+            new_content_lines.append(line)
+            old_content_lines.append(line)
+
+    # finishing last hunk
+    if match and new_content_lines:
+        if new_content_lines:
+            patch_with_lines_str += '\n--new hunk--\n'
+            for i, line_new in enumerate(new_content_lines):
+                patch_with_lines_str += f"{start2 + i} {line_new}\n"
+        if old_content_lines:
+            patch_with_lines_str += '\n--old hunk--\n'
+            for i, line_old in enumerate(old_content_lines):
+                patch_with_lines_str += f"{line_old}\n"
+
+    return patch_with_lines_str.strip()
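For orientation, a minimal sketch of what the new helper emits. The `SimpleNamespace` stand-in and the one-hunk sample patch are invented for illustration (only `.filename` is read from the file object), and the commented output is roughly what the function produces.

```python
from types import SimpleNamespace

from pr_agent.algo.git_patch_processing import convert_to_hunks_with_lines_numbers

# Hypothetical single-hunk patch; real patches come from git_provider.get_diff_files().
patch = (
    "@@ -10,3 +10,4 @@ def foo():\n"
    "     a = 1\n"
    "-    b = 2\n"
    "+    b = 3\n"
    "+    c = 4\n"
    "     return a\n"
)
file = SimpleNamespace(filename="src/example.py")  # stand-in for a FilePatchInfo-like object
print(convert_to_hunks_with_lines_numbers(patch, file))
# Roughly:
# ## src/example.py
#
# --new hunk--
# 10      a = 1
# 11 +    b = 3
# 12 +    c = 4
# 13      return a
#
# --old hunk--
#      a = 1
# -    b = 2
#      return a
```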
diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
index f99d126cf..3ea79676a 100644
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -4,7 +4,8 @@
 import logging
 from typing import Any, Tuple, Union
 
-from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions
+from pr_agent.algo.git_patch_processing import extend_patch, handle_patch_deletions, \
+    convert_to_hunks_with_lines_numbers
 from pr_agent.algo.language_handler import sort_files_by_main_languages
 from pr_agent.algo.token_handler import TokenHandler
 from pr_agent.config_loader import settings
@@ -19,26 +20,33 @@
 PATCH_EXTRA_LINES = 3
 
 
-def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler) -> str:
+def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHandler,
+                add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str:
     """
     Returns a string with the diff of the PR.
     If needed, apply diff minimization techniques to reduce the number of tokens
     """
+    if disable_extra_lines:
+        global PATCH_EXTRA_LINES
+        PATCH_EXTRA_LINES = 0
+
     git_provider.pr.diff_files = list(git_provider.get_diff_files())
 
     # get pr languages
     pr_languages = sort_files_by_main_languages(git_provider.get_languages(), git_provider.pr.diff_files)
 
     # generate a standard diff string, with patch extension
-    patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler)
+    patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler,
+                                                               add_line_numbers_to_hunks)
 
     # if we are under the limit, return the full diff
     if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < token_handler.limit:
         return "\n".join(patches_extended)
 
     # if we are over the limit, start pruning
-    patches_compressed, modified_file_names, deleted_file_names = pr_generate_compressed_diff(pr_languages,
-                                                                                              token_handler)
+    patches_compressed, modified_file_names, deleted_file_names = \
+        pr_generate_compressed_diff(pr_languages, token_handler, add_line_numbers_to_hunks)
+
     final_diff = "\n".join(patches_compressed)
     if modified_file_names:
         modified_list_str = MORE_MODIFIED_FILES_ + "\n".join(modified_file_names)
@@ -49,7 +57,8 @@ def get_pr_diff(git_provider: Union[GithubProvider, Any], token_handler: TokenHa
     return final_diff
 
 
-def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -> \
+def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler,
+                              add_line_numbers_to_hunks: bool) -> \
         Tuple[list, int]:
     """
     Generate a standard diff string, with patch extension
@@ -72,6 +81,9 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
             extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES)
             full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n"
 
+            if add_line_numbers_to_hunks:
+                full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file)
+
             patch_tokens = token_handler.count_tokens(full_extended_patch)
             file.tokens = patch_tokens
             total_tokens += patch_tokens
@@ -80,7 +92,8 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler) -
     return patches_extended, total_tokens
 
 
-def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) -> Tuple[list, list, list]:
+def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler,
+                                convert_hunks_to_line_numbers: bool) -> Tuple[list, list, list]:
     # Apply Diff Minimization techniques to reduce the number of tokens:
     # 0. Start from the largest diff patch to smaller ones
     # 1. Don't use extend context lines around diff
@@ -114,6 +127,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
             deleted_files_list.append(file.filename)
             total_tokens += token_handler.count_tokens(file.filename) + 1
             continue
+
+        if convert_hunks_to_line_numbers:
+            patch = convert_to_hunks_with_lines_numbers(patch, file)
+
         new_patch_tokens = token_handler.count_tokens(patch)
 
         # Hard Stop, no more tokens
@@ -135,7 +152,10 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler) ->
             continue
 
         if patch:
-            patch_final = f"## {file.filename}\n\n{patch}\n"
+            if not convert_hunks_to_line_numbers:
+                patch_final = f"## {file.filename}\n\n{patch}\n"
+            else:
+                patch_final = patch
             patches.append(patch_final)
             total_tokens += token_handler.count_tokens(patch_final)
             if settings.config.verbosity_level >= 2:
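As a quick orientation before the callers below, a schematic use of the two new keyword arguments. `git_provider` and `token_handler` are assumed to already exist (built the way the tool classes in this PR build them), and note that `disable_extra_lines=True` zeroes the module-level `PATCH_EXTRA_LINES` for the rest of the process.

```python
from pr_agent.algo import pr_processing
from pr_agent.algo.pr_processing import get_pr_diff

# git_provider / token_handler are assumed to exist, e.g. as constructed in
# PRCodeSuggestions.__init__ further down in this diff.
diff = get_pr_diff(git_provider, token_handler,
                   add_line_numbers_to_hunks=True,  # hunks rendered via convert_to_hunks_with_lines_numbers
                   disable_extra_lines=True)        # no extra context lines around each hunk

# Side effect to be aware of: the module-level constant is mutated, so later
# calls in the same process also run with zero extra context lines.
assert pr_processing.PATCH_EXTRA_LINES == 0
```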
diff --git a/pr_agent/cli.py b/pr_agent/cli.py
index 8b2800654..78a795ea8 100644
--- a/pr_agent/cli.py
+++ b/pr_agent/cli.py
@@ -3,6 +3,7 @@
 import logging
 import os
 
+from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
 from pr_agent.tools.pr_description import PRDescription
 from pr_agent.tools.pr_questions import PRQuestions
 from pr_agent.tools.pr_reviewer import PRReviewer
@@ -12,7 +13,8 @@ def run():
     parser = argparse.ArgumentParser(description='AI based pull request analyzer')
     parser.add_argument('--pr_url', type=str, help='The URL of the PR to review', required=True)
     parser.add_argument('--question', type=str, help='Optional question to ask', required=False)
-    parser.add_argument('--pr_description', action='store_true', help='Optional question to ask', required=False)
+    parser.add_argument('--pr_description', action='store_true', required=False)
+    parser.add_argument('--pr_code_suggestions', action='store_true', required=False)
     args = parser.parse_args()
     logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
     if args.question:
@@ -23,6 +25,10 @@ def run():
         print(f"PR description: {args.pr_url}")
         reviewer = PRDescription(args.pr_url)
         asyncio.run(reviewer.describe())
+    elif args.pr_code_suggestions:
+        print(f"PR code suggestions: {args.pr_url}")
+        reviewer = PRCodeSuggestions(args.pr_url)
+        asyncio.run(reviewer.suggest())
     else:
         print(f"Reviewing PR: {args.pr_url}")
         reviewer = PRReviewer(args.pr_url, cli_mode=True)
diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py
index ed98fccd4..18ec67468 100644
--- a/pr_agent/config_loader.py
+++ b/pr_agent/config_loader.py
@@ -12,6 +12,7 @@
         "settings/pr_reviewer_prompts.toml",
         "settings/pr_questions_prompts.toml",
         "settings/pr_description_prompts.toml",
+        "settings/pr_code_suggestions_prompts.toml",
         "settings_prod/.secrets.toml"
     ]]
 )
diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py
index dd8e4fcb1..ae39cc74c 100644
--- a/pr_agent/git_providers/git_provider.py
+++ b/pr_agent/git_providers/git_provider.py
@@ -37,6 +37,11 @@ def publish_comment(self, pr_comment: str, is_temporary: bool = False):
     def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str):
         pass
 
+    @abstractmethod
+    def publish_code_suggestion(self, body: str, relevant_file: str,
+                                relevant_lines_start: int, relevant_lines_end: int):
+        pass
+
     @abstractmethod
     def remove_initial_comment(self):
         pass
diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py
index af035c9f7..00d5722b8 100644
--- a/pr_agent/git_providers/github_provider.py
+++ b/pr_agent/git_providers/github_provider.py
@@ -7,10 +7,10 @@
 from pr_agent.config_loader import settings
 
-from .git_provider import FilePatchInfo
+from .git_provider import FilePatchInfo, GitProvider
 
 
-class GithubProvider:
+class GithubProvider(GitProvider):
     def __init__(self, pr_url: Optional[str] = None):
         self.installation_id = settings.get("GITHUB.INSTALLATION_ID")
         self.github_client = self._get_github_client()
@@ -76,6 +76,53 @@ def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in
         path = relevant_file.strip()
         self.pr.create_review_comment(body=body, commit_id=self.last_commit_id, path=path, position=position)
 
+    def publish_code_suggestion(self, body: str,
+                                relevant_file: str,
+                                relevant_lines_start: int,
+                                relevant_lines_end: int):
+        if not relevant_lines_start or relevant_lines_start == -1:
+            if settings.config.verbosity_level >= 2:
+                logging.exception(f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}")
+            return False
+
+        if relevant_lines_end < relevant_lines_start:
+            if settings.config.verbosity_level >= 2:
+                logging.exception(f"Failed to publish code suggestion, "
+                                  f"relevant_lines_end is {relevant_lines_end} and "
+                                  f"relevant_lines_start is {relevant_lines_start}")
+            return False
+
+        try:
+            import github.PullRequestComment
+            if relevant_lines_end > relevant_lines_start:
+                post_parameters = {
+                    "body": body,
+                    "commit_id": self.last_commit_id._identity,
+                    "path": relevant_file,
+                    "line": relevant_lines_end,
+                    "start_line": relevant_lines_start,
+                    "start_side": "RIGHT",
+                }
+            else:  # API is different for single line comments
+                post_parameters = {
+                    "body": body,
+                    "commit_id": self.last_commit_id._identity,
+                    "path": relevant_file,
+                    "line": relevant_lines_start,
+                    "side": "RIGHT",
+                }
+            headers, data = self.pr._requester.requestJsonAndCheck(
+                "POST", f"{self.pr.url}/comments", input=post_parameters
+            )
+            github.PullRequestComment.PullRequestComment(
+                self.pr._requester, headers, data, completed=True
+            )
+            return True
+        except Exception as e:
+            if settings.config.verbosity_level >= 2:
+                logging.error(f"Failed to publish code suggestion, error: {e}")
+            return False
+
     def remove_initial_comment(self):
         try:
             for comment in self.pr.comments_list:
diff --git a/pr_agent/git_providers/gitlab_provider.py b/pr_agent/git_providers/gitlab_provider.py
index a04a482a0..33045d2d4 100644
--- a/pr_agent/git_providers/gitlab_provider.py
+++ b/pr_agent/git_providers/gitlab_provider.py
@@ -106,6 +106,12 @@ def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in
         self.mr.discussions.create({'body': body, 'position': pos_obj})
 
+    def publish_code_suggestion(self, body: str,
+                                relevant_file: str,
+                                relevant_lines_start: int,
+                                relevant_lines_end: int):
+        raise NotImplementedError("not implemented yet for gitlab")
+
     def search_line(self, relevant_file, relevant_line_in_file):
         RE_HUNK_HEADER = re.compile(
             r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml
index 098969735..6812ed71a 100644
--- a/pr_agent/settings/configuration.toml
+++ b/pr_agent/settings/configuration.toml
@@ -8,11 +8,14 @@ verbosity_level=0 # 0,1,2
 require_focused_review=true
 require_tests_review=true
 require_security_review=true
-num_code_suggestions=4
+num_code_suggestions=3
 inline_code_comments = true
 
 [pr_questions]
 
+[pr_code_suggestions]
+num_code_suggestions=4
+
 [github]
 # The type of deployment to create. Valid values are 'app' or 'user'.
 deployment_type = "user"
diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml
new file mode 100644
index 000000000..c8619ca52
--- /dev/null
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -0,0 +1,79 @@
+[pr_code_suggestions_prompt]
+system="""You are a language model called CodiumAI-PR-Code-Reviewer.
+Your task is to provide meaningful, non-trivial code suggestions to improve the new code in a PR (the '+' lines).
+- Try to give important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices.
+- Suggestions should refer only to the 'new hunk' code, and focus on improving the newly added code lines, with '+'.
+- Provide the exact line number range (inclusive) for each issue.
+- Assume there is additional code in the relevant file that is not included in the diff.
+- Provide up to {{ num_code_suggestions }} code suggestions.
+- Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines).
+- Don't output line numbers in the 'improved code' snippets.
+
+You must use the following JSON schema to format your answer:
+```json
+{
+    "Code suggestions": {
+        "type": "array",
+        "minItems": 1,
+        "maxItems": {{ num_code_suggestions }},
+        "uniqueItems": "true",
+        "items": {
+            "relevant file": {
+                "type": "string",
+                "description": "the relevant file full path"
+            },
+            "suggestion content": {
+                "type": "string",
+                "description": "a concrete suggestion for meaningfully improving the new PR code."
+            },
+            "existing code": {
+                "type": "string",
+                "description": "a code snippet showing authentic relevant code lines from a 'new hunk' section. It must be continuous, correctly formatted and indented, and without line numbers."
+            },
+            "relevant lines": {
+                "type": "string",
+                "description": "the relevant lines in the 'new hunk' sections, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above."
+            },
+            "improved code": {
+                "type": "string",
+                "description": "a new code snippet that can be used to replace the relevant lines in 'new hunk' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers."
+            }
+        }
+    }
+}
+```
+
+Example input:
+'
+## src/file1.py
+---new_hunk---
+```
+[new hunk code, annotated with line numbers]
+```
+---old_hunk---
+```
+[old hunk code]
+```
+...
+'
+
+Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields.
+"""
+
+user="""PR Info:
+Title: '{{title}}'
+Branch: '{{branch}}'
+Description: '{{description}}'
+{%- if language %}
+Main language: {{language}}
+{%- endif %}
+
+
+The PR Diff:
+```
+{{diff}}
+```
+
+Response (should be a valid JSON, and nothing else):
+```json
+"""
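To make the contract concrete, here is a hypothetical parsed answer that satisfies the schema above; the file path, line range and snippets are invented, but the keys are the ones `_prepare_pr_code_suggestions` and `push_inline_code_suggestions` below rely on.

```python
# Hypothetical example of a model answer after json.loads(); every value is invented.
expected = {
    "Code suggestions": [
        {
            "relevant file": "src/example.py",
            "suggestion content": "Guard against an empty list before indexing it.",
            "existing code": "value = items[0]",
            "relevant lines": "12-12",  # start_line-end_line in the 'new hunk' numbering
            "improved code": "value = items[0] if items else None",
        }
    ]
}
```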
diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py
new file mode 100644
index 000000000..d55326bc3
--- /dev/null
+++ b/pr_agent/tools/pr_code_suggestions.py
@@ -0,0 +1,127 @@
+import copy
+import json
+import logging
+import textwrap
+
+from jinja2 import Environment, StrictUndefined
+
+from pr_agent.algo.ai_handler import AiHandler
+from pr_agent.algo.pr_processing import get_pr_diff
+from pr_agent.algo.token_handler import TokenHandler
+from pr_agent.algo.utils import convert_to_markdown, try_fix_json
+from pr_agent.config_loader import settings
+from pr_agent.git_providers import get_git_provider, GithubProvider
+from pr_agent.git_providers.git_provider import get_main_pr_language
+
+
+class PRCodeSuggestions:
+    def __init__(self, pr_url: str, cli_mode=False):
+
+        self.git_provider = get_git_provider()(pr_url)
+        self.main_language = get_main_pr_language(
+            self.git_provider.get_languages(), self.git_provider.get_files()
+        )
+        self.ai_handler = AiHandler()
+        self.patches_diff = None
+        self.prediction = None
+        self.cli_mode = cli_mode
+        self.vars = {
+            "title": self.git_provider.pr.title,
+            "branch": self.git_provider.get_pr_branch(),
+            "description": self.git_provider.get_pr_description(),
+            "language": self.main_language,
+            "diff": "",  # empty diff for initial calculation
+            'num_code_suggestions': settings.pr_code_suggestions.num_code_suggestions,
+        }
+        self.token_handler = TokenHandler(self.git_provider.pr,
+                                          self.vars,
+                                          settings.pr_code_suggestions_prompt.system,
+                                          settings.pr_code_suggestions_prompt.user)
+
+    async def suggest(self):
+        assert type(self.git_provider) == GithubProvider, "Only Github is supported for now"
+
+        logging.info('Generating code suggestions for PR...')
+        if settings.config.publish_review:
+            self.git_provider.publish_comment("Preparing review...", is_temporary=True)
+        logging.info('Getting PR diff...')
+
+        # we are using extended hunk with line numbers for code suggestions
+        self.patches_diff = get_pr_diff(self.git_provider,
+                                        self.token_handler,
+                                        add_line_numbers_to_hunks=True,
+                                        disable_extra_lines=True)
+
+        logging.info('Getting AI prediction...')
+        self.prediction = await self._get_prediction()
+        logging.info('Preparing PR review...')
+        data = self._prepare_pr_code_suggestions()
+        if settings.config.publish_review:
+            logging.info('Pushing PR review...')
+            self.git_provider.remove_initial_comment()
+            logging.info('Pushing inline code comments...')
+            self.push_inline_code_suggestions(data)
+
+
+    async def _get_prediction(self):
+        variables = copy.deepcopy(self.vars)
+        variables["diff"] = self.patches_diff  # update diff
+        environment = Environment(undefined=StrictUndefined)
+        system_prompt = environment.from_string(settings.pr_code_suggestions_prompt.system).render(variables)
+        user_prompt = environment.from_string(settings.pr_code_suggestions_prompt.user).render(variables)
+        if settings.config.verbosity_level >= 2:
+            logging.info(f"\nSystem prompt:\n{system_prompt}")
+            logging.info(f"\nUser prompt:\n{user_prompt}")
+        model = settings.config.model
+        response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2,
+                                                                        system=system_prompt, user=user_prompt)
+
+        return response
+
+    def _prepare_pr_code_suggestions(self) -> str:
+        review = self.prediction.strip()
+        data = None
+        try:
+            data = json.loads(review)
+        except json.decoder.JSONDecodeError:
+            if settings.config.verbosity_level >= 2:
+                logging.info(f"Could not parse json response: {review}")
+            data = try_fix_json(review)
+        return data
+
+    def push_inline_code_suggestions(self, data):
+        for d in data['Code suggestions']:
+            if settings.config.verbosity_level >= 2:
+                logging.info(f"suggestion: {d}")
+            relevant_file = d['relevant file'].strip()
+            relevant_lines_str = d['relevant lines'].strip()
+            relevant_lines_start = int(relevant_lines_str.split('-')[0])  # absolute position
+            relevant_lines_end = int(relevant_lines_str.split('-')[-1])
+            content = d['suggestion content']
+            existing_code_snippet = d['existing code']
+            new_code_snippet = d['improved code']
+
+            if new_code_snippet:
+                try:  # dedent code snippet
+                    self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files else self.git_provider.get_diff_files()
+                    original_initial_line = None
+                    for file in self.diff_files:
+                        if file.filename.strip() == relevant_file:
+                            original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1]
+                            break
+                    if original_initial_line:
+                        suggested_initial_line = new_code_snippet.splitlines()[0]
+                        original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip())
+                        suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip())
+                        delta_spaces = original_initial_spaces - suggested_initial_spaces
+                        if delta_spaces > 0:
+                            new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n')
+                except Exception as e:
+                    if settings.config.verbosity_level >= 2:
+                        logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}")
+
+            body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```"
+            success = self.git_provider.publish_code_suggestion(body=body,
+                                                                relevant_file=relevant_file,
+                                                                relevant_lines_start=relevant_lines_start,
+                                                                relevant_lines_end=relevant_lines_end)
diff --git a/pr_agent/tools/pr_description.py b/pr_agent/tools/pr_description.py
index 334678e48..b5b2a0f78 100644
--- a/pr_agent/tools/pr_description.py
+++ b/pr_agent/tools/pr_description.py
@@ -35,7 +35,7 @@ def __init__(self, pr_url: str):
         self.prediction = None
 
     async def describe(self):
-        logging.info('Answering a PR question...')
+        logging.info('Generating a PR description...')
         if settings.config.publish_review:
             self.git_provider.publish_comment("Preparing pr description...", is_temporary=True)
         logging.info('Getting PR diff...')
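To close the loop, a sketch of how the new tool would be run; the PR URL is a placeholder, and the usual provider credentials (for example the GitHub token in `.secrets.toml`) are assumed to be configured. From the command line this corresponds to passing the new `--pr_code_suggestions` flag to `cli.py`.

```python
import asyncio

from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions

# Placeholder PR URL; roughly equivalent to:
#   python pr_agent/cli.py --pr_url=<URL> --pr_code_suggestions
asyncio.run(PRCodeSuggestions("https://github.com/owner/repo/pull/1").suggest())
```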