diff --git a/README.md b/README.md index 8394139..c8a7a35 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ Multiple upstream sources are used by vdb to improve accuracy and reduce false n - NVD - GitHub -1 - We exclude Linux and oss-fuzz feeds by default. Set the environment variable `OSV_INCLUDE_FUZZ` to include them. +1 - We exclude Linux and oss-fuzz feeds by default. Set the environment variable `OSV_INCLUDE_FUZZ=true` to include them. +2 - Malware feeds are included by default, thus increasing the db size slightly. Set the environment variable `OSV_EXCLUDE_MALWARE=true` to exclude them. ## Linux distros diff --git a/pyproject.toml b/pyproject.toml index 58de7e8..65d1dab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "appthreat-vulnerability-db" -version = "6.1.0" +version = "6.1.1" description = "AppThreat's vulnerability database and package search library with a built-in sqlite based storage. OSV, CVE, GitHub, npm are the primary sources of vulnerabilities." authors = [ {name = "Team AppThreat", email = "cloud@appthreat.com"}, diff --git a/vdb/lib/osv.py b/vdb/lib/osv.py index 14ecf28..e4f196f 100644 --- a/vdb/lib/osv.py +++ b/vdb/lib/osv.py @@ -3,7 +3,7 @@ This module fetches the vulnerability data from osv.dev and stores them in NVD CVE 1.1 json format. """ - +from os import getenv from zipfile import ZipFile import httpx @@ -15,7 +15,6 @@ from vdb.lib.utils import ( compress_str, convert_score_severity, - extract_affected_symbols, get_cvss3_from_vector, get_default_cve_data, parse_purl, @@ -55,7 +54,7 @@ def download_all(self): # For performance do not retain the whole data in-memory # See: https://github.com/AppThreat/vulnerability-db/issues/27 data_list = [] - for _, url in config.osv_url_dict.items(): + for _, url in config.OSV_URL_DICT.items(): data = self.fetch(url) if data: self.store(data) @@ -121,6 +120,9 @@ def to_vuln(cve_data): references = orjson.dumps(references, option=orjson.OPT_NAIVE_UTC) if isinstance(references, bytes): references = references.decode("utf-8", "ignore") + # Offer an option to ignore malware data to keep the db size small + if getenv("OSV_EXCLUDE_MALWARE") and cve_id.startswith("MAL"): + return ret_data # Quality of PYSEC data is quite low missing both severity and score # Where a PYSEC feed also reference a github id, let's ignore it since G comes before P # so it would have gotten processed