AppThreat · prabhu · Oct 3, 2023 · Sep 29, 2023 · Oct 3, 2023 · Oct 3, 2023
diff --git a/.github/workflows/containers.yml b/.github/workflows/containers.yml
@@ -57,15 +57,17 @@ jobs:
         run: |
           python3.11 -m pip install --upgrade pip
           python3.11 -m pip install poetry
-          python3.11 -m poetry export -f requirements.txt --with=science --output target/chen-science-requirements.txt
+          python3.11 -m poetry export -f requirements.txt --with=science --without-hashes --output target/chen-science-requirements.txt
+          python3.11 -m poetry export -f requirements.txt --with=database --without-hashes --output target/chen-database-requirements.txt
       - name: Upload chen to ghcr
         run: |
           cd target
           echo $GITHUB_TOKEN | oras login ghcr.io -u $GITHUB_USERNAME --password-stdin
           oras push ghcr.io/$IMAGE_NAME:v1 \
             --annotation-file ../ci/annotations.json \
             ./chen.zip:application/vnd.appthreat.chen.layer.v1+tar \
-            ./chen-science-requirements.txt:application/vnd.appthreat.chen.layer.v1+tar
+            ./chen-science-requirements.txt:application/vnd.appthreat.chen.layer.v1+tar \
+            ./chen-database-requirements.txt:application/vnd.appthreat.chen.layer.v1+tar
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_USERNAME: ${{ github.actor }}

diff --git a/.gitignore b/.gitignore
@@ -52,4 +52,6 @@ flake.lock
 chen.zip
 .metals/
 *.pyc
-**/__pycache__/
+**/__pycache__/
+.vscode/
+project/metals.sbt
diff --git a/README.md b/README.md
@@ -11,6 +11,10 @@ Code Hierarchy Exploration Net (chen) is an advanced exploration toolkit for you
 - Node.js > 16 (To run [atom](https://github.com/AppThreat/atom))
 - Minimum 16GB RAM
 
+### Additional requirements
+
+- Rust (For rocksdb-py compilation)
+
 ## Installation
 
 ```shell
@@ -27,6 +31,12 @@ To download the chen distribution including the science pack.
 chen --download
 ```
 
+To generate custom graphs and models with atom for data science, download the science pack, which installs support for the PyTorch ecosystem.
+
+```shell
+chen --download --with-science
+```
+
 Once the download finishes, the command will display the download location along with the environment variables that need to be set to invoke `chennai` console. Example output below:
 
 ```shell

diff --git a/build.sbt b/build.sbt
@@ -1,6 +1,6 @@
 name                     := "chen"
 ThisBuild / organization := "io.appthreat"
-ThisBuild / version      := "0.0.9"
+ThisBuild / version      := "0.0.10"
 ThisBuild / scalaVersion := "3.3.1"
 
 val cpgVersion = "1.4.22"

diff --git a/chenpy/cli.py b/chenpy/cli.py
@@ -34,6 +34,13 @@ def build_args():
         help="Download the latest chen distribution in platform specific "
         "user_data_dir",
     )
+    parser.add_argument(
+        "--with-science",
+        action="store_true",
+        default=False,
+        dest="science_pack",
+        help="Download the science pack",
+    )
     parser.add_argument(
         "--server",
         action="store_true",
@@ -104,7 +111,7 @@ def fix_envs():
         )
 
 
-def download_chen_distribution(overwrite=False):
+def download_chen_distribution(overwrite=False, science_pack=False):
     if os.path.exists(os.path.join(config.chen_home, "platform")):
         if not overwrite:
             fix_envs()
@@ -150,16 +157,18 @@ def download_chen_distribution(overwrite=False):
                         pass
         # Install the science pack
         if req_files:
-            install_science_modules()
+            install_py_modules("database")
+            if science_pack:
+                install_py_modules("science")
         fix_envs()
 
 
-def install_science_modules():
+def install_py_modules(pack="database"):
     """
     Install the required science modules
     """
     LOG.debug("About to install the science pack using cpu-only configuration")
-    req_file = os.path.join(config.chen_home, "chen-science-requirements.txt")
+    req_file = os.path.join(config.chen_home, f"chen-{pack}-requirements.txt")
     if os.path.exists(req_file):
         subprocess.check_call(
             [sys.executable, "-m", "pip", "install", "-r", req_file],
@@ -174,7 +183,7 @@ def main():
     and generates reports based on the results.
     """
     args = build_args()
-    download_chen_distribution(args.download)
+    download_chen_distribution(args.download, args.science_pack)
 
 
 if __name__ == "__main__":

diff --git a/chenpy/db.py b/chenpy/db.py
@@ -0,0 +1,24 @@
+import rocksdbpy
+
+
+def db_options():
+    """Build the rocksdbpy options object that get() opens databases with."""
+    options = rocksdbpy.Option()
+    options.create_if_missing(True)
+    options.set_max_open_files(10)
+    options.set_use_fsync(True)
+    options.set_bytes_per_sync(1024 * 1024)
+    # NOTE(review): RocksDB documents this argument as a block cache size in
+    # MB, which would make this 1 TiB rather than 1 MiB - confirm the unit
+    options.optimize_for_point_lookup(1024 * 1024)
+    options.set_bloom_locality(16)
+    return options
+
+
+def get(path):
+    """Open the database at path using the options from db_options().
+
+    Returns whatever rocksdbpy.open() returns for that path.
+    """
+    options = db_options()
+    return rocksdbpy.open(path, options)
diff --git a/chenpy/graph.py b/chenpy/graph.py
@@ -7,14 +7,25 @@
 
 from chenpy.utils import calculate_hash
 
+DATABASE_PACK_AVAILABLE = True
 SCIENCE_PACK_AVAILABLE = True
 try:
     import networkx as nx
+    from networkx.readwrite import json_graph, read_graphml
+except ImportError:
+    DATABASE_PACK_AVAILABLE = False
+
+try:
     import pydotplus
     import torch
-    from networkx.readwrite import json_graph, read_graphml
     from torch import Tensor
     from torch_geometric.data import Data
+    from torchtext.data.functional import (
+        generate_sp_model,
+        load_sp_model,
+        sentencepiece_numericalizer,
+        sentencepiece_tokenizer,
+    )
 except ImportError:
     SCIENCE_PACK_AVAILABLE = False
 
@@ -226,26 +237,30 @@ def node_match_fn(n1, n2):
 
 
 def gep(first_graph, second_graph, upper_bound=500):
-    """Function to compute the difference based on optimal edit path algorithm"""
-    return nx.optimal_edit_paths(
+    paths, cost = nx.optimal_edit_paths(  # returns a (paths, cost) tuple
         first_graph,
         second_graph,
         node_match=node_match_fn,
         edge_match=node_match_fn,
         upper_bound=upper_bound,
     )
+    if cost is None:  # no edit path within upper_bound; the tuple itself is never None
+        return -1
+    return paths, cost
 
 
 def ged(first_graph, second_graph, timeout=5, upper_bound=500):
-    """Function to compute the difference based on graph edit distance algorithm"""
-    return nx.graph_edit_distance(
+    cost = nx.graph_edit_distance(
         first_graph,
         second_graph,
         node_match=node_match_fn,
         edge_match=node_match_fn,
         timeout=timeout,
         upper_bound=upper_bound,
     )
+    # graph_edit_distance may hand back None (no edit path found); that case
+    # is reported as -1, the sentinel is_similar() checks for
+    return -1 if cost is None else cost
 
 
 def write_dot(G, path):
@@ -296,22 +311,23 @@ def summarize(G, as_dict=False, as_dot=False):
     return summary_graph
 
 
-def is_similar(M1, M2, upper_bound=500, timeout=5):
-    """Function to check if two graphs are similar. To simplify the problem, first the raw graph difference is computed to check if the graphs are the same.
-    If not graph edit distance is computed with a fixed timeout to help answer the question
-    """
+def is_similar(M1, M2, edit_distance=10, upper_bound=500, timeout=5):  # NOTE(review): new 3rd positional arg shifts upper_bound/timeout for old positional callers
     if not diff_graph(M1, M2, as_dict=True):
         return True
     distance = ged(M1, M2, upper_bound=upper_bound, timeout=timeout)
-    if distance is None:
+    if distance == -1:  # ged() reports "no edit distance found" as -1
         return False
-    return True
+    return int(distance) < edit_distance  # similar only when strictly below the threshold
 
 
 def convert_graphml(
     gml_file, force_multigraph=False, as_graph=True, as_adjacency_data=False
 ):
     """Function to convert graphml to networkx"""
+    if not DATABASE_PACK_AVAILABLE:  # networkx failed to import at module load
+        raise RuntimeError(
+            "Graph database dependencies missing. Please refer to the documentation to install the database pack or use the official chen container image."
+        )
     try:
         G = read_graphml(gml_file, force_multigraph=force_multigraph)
         if as_graph:

diff --git a/chenpy/logger.py b/chenpy/logger.py
@@ -17,17 +17,43 @@
 import os
 
 from rich.console import Console
+from rich.highlighter import RegexHighlighter
 from rich.logging import RichHandler
 from rich.theme import Theme
 
-custom_theme = Theme({"info": "#5A7C90", "warning": "#FF753D", "danger": "bold red"})
+
+class CustomHighlighter(RegexHighlighter):  # regex-driven highlighter installed on the console below
+    base_style = "atom."  # the named groups below pair with the "atom.<group>" theme styles
+    highlights = [
+        r"(?P<method>([\w-]+\.)+[\w-]+[^<>:(),]?)",  # dotted names, e.g. pkg.Class.method
+        r"(?P<path>(\w+\/.*\.[\w:]+))",  # slash-separated text ending in a dotted suffix
+        r"(?P<params>[(]([\w,-]+\.)+?[\w-]+[)]$)",  # parenthesised dotted name at the end of the text
+        r"(?P<opers>(unresolvedNamespace|unresolvedSignature|init|operators|operator|clinit))",  # fixed keywords
+    ]
+
+
+custom_theme = Theme(
+    {
+        "atom.path": "#7c8082",
+        "atom.params": "#5a7c90",
+        "atom.opers": "#7c8082",
+        "atom.method": "#FF753D",
+        "info": "#5A7C90",
+        "warning": "#FF753D",
+        "danger": "bold red",
+    }
+)
+
+
 console = Console(
     log_time=False,
     log_path=False,
     theme=custom_theme,
     width=int(os.getenv("COLUMNS", "270")),
     color_system="256",
     force_terminal=True,
+    highlight=True,
+    highlighter=CustomHighlighter(),
     record=True,
 )
 

diff --git a/chenpy/source/__init__.py b/chenpy/source/__init__.py
diff --git a/chenpy/source/ghsa.py b/chenpy/source/ghsa.py
@@ -0,0 +1,122 @@
+import os
+
+import httpx
+
+# GitHub GraphQL endpoint used for advisory queries (override with GITHUB_GRAPHQL_URL)
+ghsa_api_url = os.getenv("GITHUB_GRAPHQL_URL", "https://api.github.com/graphql")
+api_token = os.getenv("GITHUB_TOKEN")  # may be None; get_download_urls() checks before use
+headers = {"Authorization": f"token {api_token}"}  # built once, at import time
+
+ecosystem_type_dict = {  # lower-cased advisory ecosystem -> purl type; unlisted names pass through
+    "go": "golang",
+    "rust": "cargo",
+    "pip": "pypi",
+    "rubygems": "gem",
+}
+
+
+def get_query(cve_or_ghsa=None, only_malware=False, extra_clause=None):
+    """Build the GraphQL request body for the securityAdvisories query.
+
+    Without an identifier the first 100 advisories are requested; with one the
+    lookup is narrowed to that GHSA or CVE id. Returns a {"query": str} dict.
+    """
+    clauses = ["first: 100"]
+    if cve_or_ghsa:
+        # Ids with a literal "GHSA" prefix are GHSA lookups; the rest are CVEs
+        kind = "GHSA" if cve_or_ghsa.startswith("GHSA") else "CVE"
+        clauses.append(f'identifier: {{type: {kind}, value: "{cve_or_ghsa}"}}')
+    if only_malware:
+        clauses.append("classifications:MALWARE")
+    if extra_clause:
+        clauses.append(f"{extra_clause}")
+    extra_args = ", ".join(clauses)
+    return {
+        "query": """
+            {
+                securityAdvisories(
+                    %(extra_args)s
+                ) {
+                    nodes {
+                    id
+                    ghsaId
+                    summary
+                    description
+                    identifiers {
+                        type
+                        value
+                    }
+                    origin
+                    publishedAt
+                    updatedAt
+                    references {
+                        url
+                    }
+                    severity
+                    withdrawnAt
+                    vulnerabilities(first: 10) {
+                        nodes {
+                        firstPatchedVersion {
+                            identifier
+                        }
+                        package {
+                            ecosystem
+                            name
+                        }
+                        severity
+                        updatedAt
+                        vulnerableVersionRange
+                        }
+                    }
+                    }
+                }
+            }
+        """
+        % {"extra_args": extra_args}
+    }
+
+
+def parse_response(json_data):
+    """Convert a securityAdvisories GraphQL response into purl records.
+
+    Returns a list of {"ghsaId", "purl"} dicts. Entries missing an ecosystem,
+    name or version are skipped; a null or absent "data" section yields [].
+    """
+    purl_list = []
+    # Error responses carry explicit nulls that .get() defaults miss, hence `or`
+    advisories = (json_data.get("data") or {}).get("securityAdvisories") or {}
+    for node in advisories.get("nodes") or []:
+        if not node:
+            continue
+        ghsa_id = node.get("ghsaId")
+        for vn in (node.get("vulnerabilities") or {}).get("nodes") or []:
+            if not vn:
+                continue
+            pkg = vn.get("package") or {}
+            patched = vn.get("firstPatchedVersion") or {}
+            vulnerable_range = vn.get("vulnerableVersionRange") or ""
+            # Prefer the patched version, else the last token of the vulnerable range
+            if patched:
+                version = patched.get("identifier", "")
+            else:
+                version = vulnerable_range.split(" ")[-1]
+            ptype = (pkg.get("ecosystem") or "").lower()
+            pname = (pkg.get("name") or "").lower().replace(":", "/")
+            if ptype and pname and version:
+                purl = f"pkg:{ecosystem_type_dict.get(ptype, ptype)}/{pname}@{version}"
+                purl_list.append({"ghsaId": ghsa_id, "purl": purl})
+    return purl_list
+
+
+def get_download_urls(cve_or_ghsa=None, only_malware=False):
+    """Query the GitHub advisory GraphQL API and return the matching purl records."""
+    if not api_token:
+        raise ValueError("GITHUB_TOKEN is required with read:packages scope")
+    # The context manager closes the connection pool even if the request raises
+    with httpx.Client(http2=True, follow_redirects=True, timeout=180) as client:
+        r = client.post(
+            url=ghsa_api_url,
+            json=get_query(cve_or_ghsa=cve_or_ghsa, only_malware=only_malware),
+            headers=headers,
+        )
+    return parse_response(r.json())