diff --git a/.gitignore b/.gitignore index 074e028..a560505 100755 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ unoplat-code-confluence/unoplat_code_confluence/utility/__pycache__ unoplat-code-confluence/unoplat_code_confluence/example_config.json unoplat-code-confluence/poetry.lock app.log +unoplat-code-confluence/unoplat_code_confluence/configuration/__pycache__ diff --git a/README.md b/README.md index 4dfb3e8..237eb44 100644 --- a/README.md +++ b/README.md @@ -120,12 +120,20 @@ Local workspace on your computer from https://github.com/DataStax-Examples/sprin "api_tokens": { "github_token": "your github pat for downloading arcguard" }, - "ai_tokens": { - "openai_api_key": "your openai api key" + "llm_provider_config": { + "openai": { + "api_key": "YourApiKey", + "model": "gpt-3.5-turbo-16k", + "model_type" : "chat", + "max_tokens": 1024, + "temperature": 0.0 + } } } ``` Configuration Note: Do not change the download_url and keep the programming_language to java (as right now only java is supported) +LLM Provider Config: +- Model Providers Supported: ["openai","together","anyscale","anthropic"] For config inside llm_provider_config refer - [Dspy Model Provider Doc](https://dspy-docs.vercel.app/docs/category/remote-language-model-clients) Note: we have tried gpt3.5 turbo and it works well as data is precise for code understanding. Our experience with https://huggingface.co/01-ai/Yi-1.5-34B-Chat also has been great apart from hiccups with last level when codebase understand is being formed. Also this will get much better as currently all the dspy modules are uncompiled.We will be rolling out evaluated models and results post optimisation soon. Until then users can use 3.5turbo for decent results. 
diff --git a/unoplat-code-confluence/unoplat_code_confluence/__main__.py b/unoplat-code-confluence/unoplat_code_confluence/__main__.py index 5d7baf6..4f8e0e2 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/__main__.py +++ b/unoplat-code-confluence/unoplat_code_confluence/__main__.py @@ -4,6 +4,7 @@ import datetime from unoplat_code_confluence.codebaseparser.arc_guard_handler import ArchGuardHandler import re +from unoplat_code_confluence.configuration.external_config import AppConfig from unoplat_code_confluence.data_models.chapi_unoplat_codebase import UnoplatCodebase from unoplat_code_confluence.data_models.dspy.dspy_unoplat_codebase_summary import DspyUnoplatCodebaseSummary from unoplat_code_confluence.downloader.downloader import Downloader @@ -40,29 +41,12 @@ def start_pipeline(): def get_codebase_metadata(json_configuration_data,iload_json,iparse_json,isummariser): # Collect necessary inputs from the user to set up the codebase indexing - local_workspace_path = json_configuration_data["local_workspace_path"] - programming_language = json_configuration_data["programming_language"] - output_path_field = json_configuration_data["output_path"] - output_file_name = json_configuration_data["output_file_name"] - codebase_name_field = json_configuration_data["codebase_name"] - github_token = json_configuration_data["api_tokens"]["github_token"] - arcguard_cli_repo = json_configuration_data["repo"]["download_url"] - local_download_directory = json_configuration_data["repo"]["download_directory"] - ai_tokens = json_configuration_data["ai_tokens"] - + app_config = AppConfig(**json_configuration_data) + # Button to submit the indexing start_parsing( - local_workspace_path, - ai_tokens, - # move this when expanding to new languages - programming_language, - output_path_field, - output_file_name, - codebase_name_field, - github_token, - arcguard_cli_repo, - local_download_directory, + app_config, iload_json, iparse_json, isummariser @@ -99,26 +83,26 @@ def 
ensure_jar_downloaded(github_token,arcguard_cli_repo,local_download_director return jar_path -def start_parsing(local_workspace_path, ai_tokens, programming_language, output_path,output_file_name, codebase_name, github_token, arcguard_cli_repo, local_download_directory, iload_json, iparse_json, isummariser): +def start_parsing(app_config: AppConfig, iload_json: JsonLoader, iparse_json: JsonParser, isummariser: MarkdownSummariser): # Log the start of the parsing process logger.info("Starting parsing process...") # Ensure the JAR is downloaded or use the existing one - jar_path = ensure_jar_downloaded(github_token,arcguard_cli_repo,local_download_directory) + jar_path = ensure_jar_downloaded(app_config.api_tokens["github_token"],app_config.repo.download_url,app_config.repo.download_directory) - logger.info(f"Local Workspace URL: {local_workspace_path}") - logger.info(f"Programming Language: {programming_language}") - logger.info(f"Output Path: {output_path}") - logger.info(f"Codebase Name: {codebase_name}") + logger.info(f"Local Workspace URL: {app_config.local_workspace_path}") + logger.info(f"Programming Language: {app_config.programming_language}") + logger.info(f"Output Path: {app_config.output_path}") + logger.info(f"Codebase Name: {app_config.codebase_name}") # Initialize the ArchGuard handler with the collected parameters. 
archguard_handler = ArchGuardHandler( jar_path=jar_path, - language=programming_language, - codebase_path=local_workspace_path, - codebase_name=codebase_name, - output_path=output_path + language=app_config.programming_language, + codebase_path=app_config.local_workspace_path, + codebase_name=app_config.codebase_name, + output_path=app_config.output_path ) chapi_metadata_path = archguard_handler.run_scan() @@ -128,7 +112,7 @@ def start_parsing(local_workspace_path, ai_tokens, programming_language, output_ current_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - output_filename = f"{codebase_name}_{current_timestamp}.md" + output_filename = f"{app_config.codebase_name}_{current_timestamp}.md" unoplat_codebase : UnoplatCodebase = iparse_json.parse_json_to_nodes(chapi_metadata) @@ -144,7 +128,7 @@ def start_parsing(local_workspace_path, ai_tokens, programming_language, output_ dspy_class_pipeline_summary : CodeConfluenceClassModule = CodeConfluenceClassModule() - codebase_summary = CodebaseSummaryParser(unoplat_codebase,dspy_function_pipeline_summary, dspy_class_pipeline_summary,dspy_package_pipeline_summary,dspy_codebase_pipeline_summary,ai_tokens) + codebase_summary = CodebaseSummaryParser(unoplat_codebase,dspy_function_pipeline_summary, dspy_class_pipeline_summary,dspy_package_pipeline_summary,dspy_codebase_pipeline_summary,app_config.llm_provider_config) unoplat_codebase_summary: DspyUnoplatCodebaseSummary = codebase_summary.parse_codebase() @@ -152,7 +136,7 @@ def start_parsing(local_workspace_path, ai_tokens, programming_language, output_ markdown_output = isummariser.summarise_to_markdown(unoplat_codebase_summary) # write the markdown output to a file - with open(os.path.join(output_path, output_filename), 'w') as md_file: + with open(os.path.join(app_config.output_path, output_filename), 'w') as md_file: md_file.write(markdown_output) logger.info("Parsing process completed.") diff --git 
a/unoplat-code-confluence/unoplat_code_confluence/configuration/__init__.py b/unoplat-code-confluence/unoplat_code_confluence/configuration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/unoplat-code-confluence/unoplat_code_confluence/configuration/external_config.py b/unoplat-code-confluence/unoplat_code_confluence/configuration/external_config.py new file mode 100644 index 0000000..397daa6 --- /dev/null +++ b/unoplat-code-confluence/unoplat_code_confluence/configuration/external_config.py @@ -0,0 +1,60 @@ +from enum import Enum +from typing import Any, Dict +from pydantic import BaseModel, ValidationInfo, field_validator + + +class LLMProvider(Enum): + OPENAI = 'openai' + COHERE = 'cohere' + ANYSCALE = 'anyscale' + TOGETHER = 'together' + + + + +class ProgrammingLanguage(Enum): + # PYTHON = 'Python' + JAVA = 'java' + # JAVASCRIPT = 'JavaScript' + # CSHARP = 'C#' + # RUBY = 'Ruby' + # GO = 'Go' + # KOTLIN = 'Kotlin' + # SWIFT = 'Swift' + + +class RepoConfig(BaseModel): + download_url: str + download_directory: str + +class AppConfig(BaseModel): + local_workspace_path: str + output_path: str + output_file_name: str + codebase_name: str + programming_language: str + repo: RepoConfig + api_tokens: Dict[str, str] + llm_provider_config: Dict[str, Any] + + @field_validator('programming_language') + def check_programming_language(cls, value, info:ValidationInfo): + if value not in [member.value for member in ProgrammingLanguage]: + raise ValueError("programming_language must be a valid programming language") + return value + + + @field_validator('api_tokens') + def check_api_tokens(cls, value, info:ValidationInfo): + if 'github_token' not in value: + raise ValueError("github_token is required in api_tokens") + if len(value) != 1: + raise ValueError("api_tokens must only contain github_token") + return value + + @field_validator('llm_provider_config') + def check_llm_provider_config(cls, value, info:ValidationInfo): + #TODO if key is in LLmProvider 
enum + if not all(key in LLMProvider for key in value.keys()): + raise ValueError("llm_provider_config keys must be in LLMProvider enum") + return value diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py index 3970a61..fb2d84f 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_class_summary.py @@ -28,7 +28,7 @@ def __init__(self): self.generate_class_summary = dspy.ChainOfThought(CodeConfluenceClassSummarySignature) self.generate_class_objective = dspy.ChainOfThought(CodeConfluenceClassObjectiveSignature) - def forward(self, class_metadata: DspyUnoplatNodeSubset, function_objective_summary: List[DspyUnoplatFunctionSummary],llm_config: Dict): + def forward(self, class_metadata: DspyUnoplatNodeSubset, function_objective_summary: List[DspyUnoplatFunctionSummary]): logger.info(f"Generating class summary for {class_metadata.node_name}") class_summary = "" diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py index 168f6cf..732da0c 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_codebase_summary.py @@ -24,7 +24,7 @@ def __init__(self): self.generate_codebase_objective = dspy.Predict(CodeConfluenceCodebaseObjectiveSignature) - def forward(self, package_objective_dict: Dict[str, DspyUnoplatPackageNodeSummary],llm_config: Dict): + def forward(self, package_objective_dict: Dict[str, DspyUnoplatPackageNodeSummary]): codebase_summary = "" diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py index 611d52d..02a0193 100644 --- 
a/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_function_summary.py @@ -36,7 +36,7 @@ def __init__(self): self.generate_function_summary_with_class_metadata = dspy.ChainOfThought(CodeConfluenceFunctionSummaryWithClassSignature) self.generate_function_objective = dspy.ChainOfThought(CodeConfluenceFunctionObjectiveSignature) - def forward(self, function_metadata: DspyUnoplatFunctionSubset, class_metadata: DspyUnoplatNodeSubset,llm_config: Dict): + def forward(self, function_metadata: DspyUnoplatFunctionSubset, class_metadata: DspyUnoplatNodeSubset): logger.info(f"Generating function summary for {function_metadata.name}") function_summary = "" class_subset = str(class_metadata.model_dump_json()) diff --git a/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py b/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py index 8f45d44..d4b530d 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/dspy_package_summary.py @@ -24,7 +24,7 @@ def __init__(self): self.generate_package_objective = dspy.ChainOfThought(CodeConfluencePackageObjectiveSignature) - def forward(self, class_objective_list: List[DspyUnoplatNodeSummary],llm_config: Dict): + def forward(self, class_objective_list: List[DspyUnoplatNodeSummary]): package_summary = "" for class_objective in class_objective_list: diff --git a/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py b/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py index 53df170..8c980ab 100644 --- a/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py +++ b/unoplat-code-confluence/unoplat_code_confluence/summary_parser/codebase_summary.py @@ -15,23 +15,28 @@ class CodebaseSummaryParser: - def __init__(self, codebase: UnoplatCodebase, 
dspy_pipeline_function: CodeConfluenceFunctionModule, dspy_pipeline_class: CodeConfluenceClassModule,dspy_pipeline_package: CodeConfluencePackageModule,dspy_pipeline_codebase: CodeConfluenceCodebaseModule,ai_tokens: dict): + def __init__(self, codebase: UnoplatCodebase, dspy_pipeline_function: CodeConfluenceFunctionModule, dspy_pipeline_class: CodeConfluenceClassModule,dspy_pipeline_package: CodeConfluencePackageModule,dspy_pipeline_codebase: CodeConfluenceCodebaseModule,llm_config: dict): self.codebase = codebase self.dspy_pipeline_function = dspy_pipeline_function self.dspy_pipeline_class = dspy_pipeline_class self.dspy_pipeline_package = dspy_pipeline_package self.dspy_pipeline_codebase = dspy_pipeline_codebase - self.ai_tokens = ai_tokens - #TODO: we will be externalise the different llms that can be used at all dspy pipelines and within dspy pipelines once dspy switches to litellm - self.config = { - "llm_to_codebase_summary": dspy.OpenAI(model='gpt-3.5-turbo-16k',api_key=self.ai_tokens["openai_api_key"]) - } - self.init_dspy_lm() + self.init_dspy_lm(llm_config) - def init_dspy_lm(self): - dspy.configure(lm=self.config["llm_to_codebase_summary"],experimental=True) - + def init_dspy_lm(self,llm_config: dict): + #todo define a switch case + llm_provider = next(iter(llm_config.keys())) + match llm_provider: + case "openai": + dspy.configure(lm=dspy.OpenAI(**llm_config["openai"]),experimental=True) + case "together": + dspy.configure(lm=dspy.Together(**llm_config["together"]),experimental=True) + case "anyscale": + dspy.configure(lm=dspy.Anyscale(**llm_config["anyscale"]),experimental=True) + case "anthropic": + dspy.configure(lm=dspy.Anthropic(**llm_config["anthropic"]),experimental=True) + def parse_codebase(self) -> DspyUnoplatCodebaseSummary: @@ -46,21 +51,21 @@ def parse_codebase(self) -> DspyUnoplatCodebaseSummary: function_summaries :List[DspyUnoplatFunctionSummary] = [] for function in node.functions: - function_summary = 
self.dspy_pipeline_function(function_metadata=function,class_metadata=node,llm_config=self.config).answer + function_summary = self.dspy_pipeline_function(function_metadata=function,class_metadata=node).answer dspyUnoplatFunctionSummary: DspyUnoplatFunctionSummary = DspyUnoplatFunctionSummary(FunctionName=function.name,FunctionSummary=function_summary) function_summaries.append(dspyUnoplatFunctionSummary) - class_summary: DspyUnoplatNodeSummary = self.dspy_pipeline_class(class_metadata=node, function_objective_summary=function_summaries,llm_config=self.config).answer + class_summary: DspyUnoplatNodeSummary = self.dspy_pipeline_class(class_metadata=node, function_objective_summary=function_summaries).answer class_summaries.append(class_summary) - dspy_pipeline_package_node_summary: DspyUnoplatPackageNodeSummary = self.dspy_pipeline_package(class_summaries,llm_config=self.config).answer + dspy_pipeline_package_node_summary: DspyUnoplatPackageNodeSummary = self.dspy_pipeline_package(class_summaries).answer logger.info(f"Generating package summary for {package_name}") unoplat_package_summary.package_summary_dict[package_name] = dspy_pipeline_package_node_summary # Extract list of DspyUnoplatPackageNodeSummary from unoplat_package_summary # Pass the list of DspyUnoplatPackageNodeSummary to dspy_pipeline_codebase - dspy_codebase_summary = self.dspy_pipeline_codebase(package_objective_dict=unoplat_package_summary.package_summary_dict,llm_config=self.config) + dspy_codebase_summary = self.dspy_pipeline_codebase(package_objective_dict=unoplat_package_summary.package_summary_dict) unoplat_codebase_summary.codebase_summary = dspy_codebase_summary.summary unoplat_codebase_summary.codebase_objective = dspy_codebase_summary.answer