Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: externalised llm provider config and enabled support for anthro… #58

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,4 @@ unoplat-code-confluence/unoplat_code_confluence/utility/__pycache__
unoplat-code-confluence/unoplat_code_confluence/example_config.json
unoplat-code-confluence/poetry.lock
app.log
unoplat-code-confluence/unoplat_code_confluence/configuration/__pycache__
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,19 @@ Local workspace on your computer from https://github.com/DataStax-Examples/sprin
"api_tokens": {
"github_token": "your github pat for downloading arcguard"
},
"ai_tokens": {
"openai_api_key": "your openai api key"
"llm_provider_config": {
"openai": {
"api_key": "YourApiKey",
"model": "gpt-3.5-turbo-16k",
"model_type" : "chat",
"max_tokens": 1024,
"temperature": 0.0
}
}
```
Configuration Note: Do not change the download_url, and keep programming_language set to java (currently only Java is supported).
LLM Provider Config:
- Model providers supported: ["openai", "together", "anyscale", "anthropic"]. For the settings that go inside llm_provider_config, refer to the [Dspy Model Provider Doc](https://dspy-docs.vercel.app/docs/category/remote-language-model-clients)

Note: we have tried gpt-3.5-turbo and it works well, as the data it produces is precise for code understanding. Our experience with https://huggingface.co/01-ai/Yi-1.5-34B-Chat has also been great, apart from hiccups at the last level, when the codebase understanding is being formed. This will also get much better, as currently all the dspy modules are uncompiled. We will be rolling out evaluated models and results post-optimisation soon. Until then, users can use gpt-3.5-turbo for decent results.

Expand Down
50 changes: 17 additions & 33 deletions unoplat-code-confluence/unoplat_code_confluence/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
from unoplat_code_confluence.codebaseparser.arc_guard_handler import ArchGuardHandler
import re
from unoplat_code_confluence.configuration.external_config import AppConfig
from unoplat_code_confluence.data_models.chapi_unoplat_codebase import UnoplatCodebase
from unoplat_code_confluence.data_models.dspy.dspy_unoplat_codebase_summary import DspyUnoplatCodebaseSummary
from unoplat_code_confluence.downloader.downloader import Downloader
Expand Down Expand Up @@ -40,29 +41,12 @@ def start_pipeline():

def get_codebase_metadata(json_configuration_data,iload_json,iparse_json,isummariser):
# Collect necessary inputs from the user to set up the codebase indexing
local_workspace_path = json_configuration_data["local_workspace_path"]
programming_language = json_configuration_data["programming_language"]
output_path_field = json_configuration_data["output_path"]
output_file_name = json_configuration_data["output_file_name"]
codebase_name_field = json_configuration_data["codebase_name"]
github_token = json_configuration_data["api_tokens"]["github_token"]
arcguard_cli_repo = json_configuration_data["repo"]["download_url"]
local_download_directory = json_configuration_data["repo"]["download_directory"]
ai_tokens = json_configuration_data["ai_tokens"]

app_config = AppConfig(**json_configuration_data)


# Button to submit the indexing
start_parsing(
local_workspace_path,
ai_tokens,
# move this when expanding to new languages
programming_language,
output_path_field,
output_file_name,
codebase_name_field,
github_token,
arcguard_cli_repo,
local_download_directory,
app_config,
iload_json,
iparse_json,
isummariser
Expand Down Expand Up @@ -99,26 +83,26 @@ def ensure_jar_downloaded(github_token,arcguard_cli_repo,local_download_director
return jar_path


def start_parsing(local_workspace_path, ai_tokens, programming_language, output_path,output_file_name, codebase_name, github_token, arcguard_cli_repo, local_download_directory, iload_json, iparse_json, isummariser):
def start_parsing(app_config: AppConfig, iload_json: JsonLoader, iparse_json: JsonParser, isummariser: MarkdownSummariser):

# Log the start of the parsing process
logger.info("Starting parsing process...")

# Ensure the JAR is downloaded or use the existing one
jar_path = ensure_jar_downloaded(github_token,arcguard_cli_repo,local_download_directory)
jar_path = ensure_jar_downloaded(app_config.api_tokens["github_token"],app_config.repo.download_url,app_config.repo.download_directory)

logger.info(f"Local Workspace URL: {local_workspace_path}")
logger.info(f"Programming Language: {programming_language}")
logger.info(f"Output Path: {output_path}")
logger.info(f"Codebase Name: {codebase_name}")
logger.info(f"Local Workspace URL: {app_config.local_workspace_path}")
logger.info(f"Programming Language: {app_config.programming_language}")
logger.info(f"Output Path: {app_config.output_path}")
logger.info(f"Codebase Name: {app_config.codebase_name}")

# Initialize the ArchGuard handler with the collected parameters.
archguard_handler = ArchGuardHandler(
jar_path=jar_path,
language=programming_language,
codebase_path=local_workspace_path,
codebase_name=codebase_name,
output_path=output_path
language=app_config.programming_language,
codebase_path=app_config.local_workspace_path,
codebase_name=app_config.codebase_name,
output_path=app_config.output_path
)

chapi_metadata_path = archguard_handler.run_scan()
Expand All @@ -128,7 +112,7 @@ def start_parsing(local_workspace_path, ai_tokens, programming_language, output_

current_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

output_filename = f"{codebase_name}_{current_timestamp}.md"
output_filename = f"{app_config.codebase_name}_{current_timestamp}.md"

unoplat_codebase : UnoplatCodebase = iparse_json.parse_json_to_nodes(chapi_metadata)

Expand All @@ -144,15 +128,15 @@ def start_parsing(local_workspace_path, ai_tokens, programming_language, output_

dspy_class_pipeline_summary : CodeConfluenceClassModule = CodeConfluenceClassModule()

codebase_summary = CodebaseSummaryParser(unoplat_codebase,dspy_function_pipeline_summary, dspy_class_pipeline_summary,dspy_package_pipeline_summary,dspy_codebase_pipeline_summary,ai_tokens)
codebase_summary = CodebaseSummaryParser(unoplat_codebase,dspy_function_pipeline_summary, dspy_class_pipeline_summary,dspy_package_pipeline_summary,dspy_codebase_pipeline_summary,app_config.llm_provider_config)

unoplat_codebase_summary: DspyUnoplatCodebaseSummary = codebase_summary.parse_codebase()

# now write to a markdown dspy unoplat codebase summary

markdown_output = isummariser.summarise_to_markdown(unoplat_codebase_summary)
# write the markdown output to a file
with open(os.path.join(output_path, output_filename), 'w') as md_file:
with open(os.path.join(app_config.output_path, output_filename), 'w') as md_file:
md_file.write(markdown_output)

logger.info("Parsing process completed.")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from enum import Enum
from typing import Any, Dict
from pydantic import BaseModel, ValidationInfo, field_validator


class LLMProvider(Enum):
    """Model providers accepted as keys of ``llm_provider_config``.

    The member values must match the provider keys dispatched on by
    ``CodebaseSummaryParser.init_dspy_lm``; a key missing here is rejected
    by ``AppConfig.check_llm_provider_config`` before it ever reaches dspy.
    """

    OPENAI = 'openai'
    # NOTE(review): 'cohere' passes config validation but has no branch in
    # init_dspy_lm — confirm intended support before advertising it.
    COHERE = 'cohere'
    ANYSCALE = 'anyscale'
    TOGETHER = 'together'
    # Fix: anthropic is advertised in the README and handled by
    # init_dspy_lm, but was missing from this enum, so an anthropic
    # config failed validation.
    ANTHROPIC = 'anthropic'




class ProgrammingLanguage(Enum):
    """Programming languages the pipeline can parse.

    Only Java is enabled for now; further languages are planned.
    """

    JAVA = 'java'
    # Planned, not yet supported:
    # PYTHON = 'Python'
    # JAVASCRIPT = 'JavaScript'
    # CSHARP = 'C#'
    # RUBY = 'Ruby'
    # GO = 'Go'
    # KOTLIN = 'Kotlin'
    # SWIFT = 'Swift'


class RepoConfig(BaseModel):
    """Location of the ArcGuard CLI artifact.

    ``download_url`` is the remote URL the jar is fetched from;
    ``download_directory`` is the local directory it is stored in
    (both consumed by ``ensure_jar_downloaded``).
    """

    download_url: str
    download_directory: str

class AppConfig(BaseModel):
    """Top-level application configuration loaded from the external JSON file.

    Mirrors the config documented in the README: workspace/output paths,
    codebase identity, the ArcGuard repo location, API tokens, and the
    per-provider LLM settings forwarded verbatim to dspy.
    """

    local_workspace_path: str
    output_path: str
    output_file_name: str
    codebase_name: str
    programming_language: str
    repo: RepoConfig
    api_tokens: Dict[str, str]
    llm_provider_config: Dict[str, Any]

    @field_validator('programming_language')
    def check_programming_language(cls, value, info: ValidationInfo):
        """Reject languages not listed in ProgrammingLanguage."""
        if value not in [member.value for member in ProgrammingLanguage]:
            raise ValueError("programming_language must be a valid programming language")
        return value

    @field_validator('api_tokens')
    def check_api_tokens(cls, value, info: ValidationInfo):
        """api_tokens must contain github_token and nothing else."""
        if 'github_token' not in value:
            raise ValueError("github_token is required in api_tokens")
        if len(value) != 1:
            raise ValueError("api_tokens must only contain github_token")
        return value

    @field_validator('llm_provider_config')
    def check_llm_provider_config(cls, value, info: ValidationInfo):
        """Every provider key must be a known LLMProvider value.

        Compare against the enum *values* explicitly: the original
        ``key in LLMProvider`` raises TypeError for non-member strings on
        Python < 3.12 instead of returning False.
        """
        valid_providers = {member.value for member in LLMProvider}
        unknown = set(value) - valid_providers
        if unknown:
            raise ValueError(
                f"llm_provider_config keys must be in LLMProvider enum; unknown: {sorted(unknown)}"
            )
        return value
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self):
self.generate_class_summary = dspy.ChainOfThought(CodeConfluenceClassSummarySignature)
self.generate_class_objective = dspy.ChainOfThought(CodeConfluenceClassObjectiveSignature)

def forward(self, class_metadata: DspyUnoplatNodeSubset, function_objective_summary: List[DspyUnoplatFunctionSummary],llm_config: Dict):
def forward(self, class_metadata: DspyUnoplatNodeSubset, function_objective_summary: List[DspyUnoplatFunctionSummary]):
logger.info(f"Generating class summary for {class_metadata.node_name}")
class_summary = ""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self):
self.generate_codebase_objective = dspy.Predict(CodeConfluenceCodebaseObjectiveSignature)


def forward(self, package_objective_dict: Dict[str, DspyUnoplatPackageNodeSummary],llm_config: Dict):
def forward(self, package_objective_dict: Dict[str, DspyUnoplatPackageNodeSummary]):

codebase_summary = ""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self):
self.generate_function_summary_with_class_metadata = dspy.ChainOfThought(CodeConfluenceFunctionSummaryWithClassSignature)
self.generate_function_objective = dspy.ChainOfThought(CodeConfluenceFunctionObjectiveSignature)

def forward(self, function_metadata: DspyUnoplatFunctionSubset, class_metadata: DspyUnoplatNodeSubset,llm_config: Dict):
def forward(self, function_metadata: DspyUnoplatFunctionSubset, class_metadata: DspyUnoplatNodeSubset):
logger.info(f"Generating function summary for {function_metadata.name}")
function_summary = ""
class_subset = str(class_metadata.model_dump_json())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self):
self.generate_package_objective = dspy.ChainOfThought(CodeConfluencePackageObjectiveSignature)


def forward(self, class_objective_list: List[DspyUnoplatNodeSummary],llm_config: Dict):
def forward(self, class_objective_list: List[DspyUnoplatNodeSummary]):

package_summary = ""
for class_objective in class_objective_list:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@

class CodebaseSummaryParser:

def __init__(self, codebase: UnoplatCodebase, dspy_pipeline_function: CodeConfluenceFunctionModule, dspy_pipeline_class: CodeConfluenceClassModule,dspy_pipeline_package: CodeConfluencePackageModule,dspy_pipeline_codebase: CodeConfluenceCodebaseModule,ai_tokens: dict):
def __init__(self, codebase: UnoplatCodebase, dspy_pipeline_function: CodeConfluenceFunctionModule, dspy_pipeline_class: CodeConfluenceClassModule,dspy_pipeline_package: CodeConfluencePackageModule,dspy_pipeline_codebase: CodeConfluenceCodebaseModule,llm_config: dict):
self.codebase = codebase
self.dspy_pipeline_function = dspy_pipeline_function
self.dspy_pipeline_class = dspy_pipeline_class
self.dspy_pipeline_package = dspy_pipeline_package
self.dspy_pipeline_codebase = dspy_pipeline_codebase
self.ai_tokens = ai_tokens

#TODO: we will be externalise the different llms that can be used at all dspy pipelines and within dspy pipelines once dspy switches to litellm
self.config = {
"llm_to_codebase_summary": dspy.OpenAI(model='gpt-3.5-turbo-16k',api_key=self.ai_tokens["openai_api_key"])
}
self.init_dspy_lm()
self.init_dspy_lm(llm_config)

def init_dspy_lm(self):
dspy.configure(lm=self.config["llm_to_codebase_summary"],experimental=True)

def init_dspy_lm(self,llm_config: dict):
#todo define a switch case
llm_provider = next(iter(llm_config.keys()))
match llm_provider:
case "openai":
dspy.configure(lm=dspy.OpenAI(**llm_config["openai"]),experimental=True)
case "together":
dspy.configure(lm=dspy.Together(**llm_config["together"]),experimental=True)
case "anyscale":
dspy.configure(lm=dspy.Anyscale(**llm_config["anyscale"]),experimental=True)
case "anthropic":
dspy.configure(lm=dspy.Anthropic(**llm_config["anthropic"]),experimental=True)



def parse_codebase(self) -> DspyUnoplatCodebaseSummary:
Expand All @@ -46,21 +51,21 @@ def parse_codebase(self) -> DspyUnoplatCodebaseSummary:
function_summaries :List[DspyUnoplatFunctionSummary] = []

for function in node.functions:
function_summary = self.dspy_pipeline_function(function_metadata=function,class_metadata=node,llm_config=self.config).answer
function_summary = self.dspy_pipeline_function(function_metadata=function,class_metadata=node).answer
dspyUnoplatFunctionSummary: DspyUnoplatFunctionSummary = DspyUnoplatFunctionSummary(FunctionName=function.name,FunctionSummary=function_summary)
function_summaries.append(dspyUnoplatFunctionSummary)

class_summary: DspyUnoplatNodeSummary = self.dspy_pipeline_class(class_metadata=node, function_objective_summary=function_summaries,llm_config=self.config).answer
class_summary: DspyUnoplatNodeSummary = self.dspy_pipeline_class(class_metadata=node, function_objective_summary=function_summaries).answer
class_summaries.append(class_summary)

dspy_pipeline_package_node_summary: DspyUnoplatPackageNodeSummary = self.dspy_pipeline_package(class_summaries,llm_config=self.config).answer
dspy_pipeline_package_node_summary: DspyUnoplatPackageNodeSummary = self.dspy_pipeline_package(class_summaries).answer
logger.info(f"Generating package summary for {package_name}")
unoplat_package_summary.package_summary_dict[package_name] = dspy_pipeline_package_node_summary

# Extract list of DspyUnoplatPackageNodeSummary from unoplat_package_summary
# Pass the list of DspyUnoplatPackageNodeSummary to dspy_pipeline_codebase

dspy_codebase_summary = self.dspy_pipeline_codebase(package_objective_dict=unoplat_package_summary.package_summary_dict,llm_config=self.config)
dspy_codebase_summary = self.dspy_pipeline_codebase(package_objective_dict=unoplat_package_summary.package_summary_dict)

unoplat_codebase_summary.codebase_summary = dspy_codebase_summary.summary
unoplat_codebase_summary.codebase_objective = dspy_codebase_summary.answer
Expand Down
Loading