- {result.name}
+ {truncateText(result.name, 20)}
{result.similarity.toFixed(3)}
@@ -78,7 +84,7 @@ const SearchResultsTable: React.FC = ({
{result.weekly_downloads.toLocaleString()}
-
+
{result.summary}
diff --git a/frontend/app/components/SupportButton.tsx b/frontend/app/components/SupportButton.tsx
new file mode 100644
index 0000000..a55a2fb
--- /dev/null
+++ b/frontend/app/components/SupportButton.tsx
@@ -0,0 +1,23 @@
+import React from "react";
+
+const SupportButton: React.FC = () => {
+ return (
+
+
+ Support
+
+ );
+};
+
+export default SupportButton;
diff --git a/frontend/app/globals.css b/frontend/app/globals.css
index 3216bc2..7c5c7b4 100644
--- a/frontend/app/globals.css
+++ b/frontend/app/globals.css
@@ -6,8 +6,8 @@
--foreground-rgb: 0, 0, 0;
--background-start-rgb: 214, 219, 220;
--background-end-rgb: 255, 255, 255;
- --dark-bg-start-rgb: 10, 10, 35; /* Very dark blue almost grey */
- --dark-bg-end-rgb: 25, 25, 50; /* Dark blue */
+ --dark-bg-start-rgb: 17, 24, 39; /* Dark gray (bg-gray-900) */
+ --dark-bg-end-rgb: 17, 24, 39; /* Dark gray (bg-gray-900) */
--dark-foreground-rgb: 255, 255, 255;
}
@@ -21,12 +21,7 @@
body {
color: rgb(var(--foreground-rgb));
- background: linear-gradient(
- to bottom,
- transparent,
- rgb(var(--background-end-rgb))
- )
- rgb(var(--background-start-rgb));
+ background: rgb(var(--background-start-rgb)); /* Solid background color */
}
@layer utilities {
diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx
index c3ce2cc..b539365 100644
--- a/frontend/app/page.tsx
+++ b/frontend/app/page.tsx
@@ -5,6 +5,8 @@ import { handleSearch, sortResults } from "./utils/search";
import SearchResultsTable from "./components/SearchResultsTable";
import InfoBox from "./components/InfoBox";
import { ClipLoader } from "react-spinners";
+import GitHubButton from "./components/GitHubButton";
+import SupportButton from "./components/SupportButton";
interface Match {
name: string;
@@ -31,20 +33,29 @@ export default function Home() {
};
return (
-
-
+
+
+
+
Enter your query to search for Python packages
-
+
+
handleSearch(
text,
@@ -75,7 +86,7 @@ export default function Home() {
setInfoBoxVisible(!infoBoxVisible)}
>
{infoBoxVisible ? "Hide Info" : "How does this work?"}
diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts
index 56c652a..88bfb28 100644
--- a/frontend/app/utils/search.ts
+++ b/frontend/app/utils/search.ts
@@ -7,6 +7,14 @@ interface Match {
summary: string;
}
+interface SearchResponse {
+ matches: Match[];
+ warning?: boolean;
+ warning_message?: string;
+}
+
+const apiUrl = process.env.NEXT_PUBLIC_API_URL;
+
export const handleSearch = async (
query: string,
sortField: string,
@@ -18,8 +26,8 @@ export const handleSearch = async (
setLoading(true);
setError("");
try {
- const response = await axios.post(
- "http://localhost:8000/search",
+ const response = await axios.post<SearchResponse>(
+ `${apiUrl}/search`,
{
query: query,
},
@@ -29,10 +37,20 @@ export const handleSearch = async (
},
},
);
- const fetchedResults: Match[] = response.data.matches;
- setResults(sortResults(fetchedResults, sortField, sortDirection));
+
+ const { matches, warning, warning_message } = response.data;
+
+ if (warning && warning_message) {
+ console.warn("Warning from API:", warning_message);
+ }
+
+ setResults(sortResults(matches, sortField, sortDirection));
} catch (error) {
- setError("Error fetching search results.");
+ if (axios.isAxiosError(error) && error.response?.status === 429) {
+ setError("Rate limit reached. Please wait a minute and try again.");
+ } else {
+ setError("Error fetching search results.");
+ }
console.error("Error fetching search results:", error);
} finally {
setLoading(false);
@@ -46,8 +64,9 @@ export const sortResults = (
): Match[] => {
return [...data].sort((a, b) => {
// @ts-ignore
- if (a[field] < b[field]) return direction === "asc" ? -1 : 1; // @ts-ignore
- if (a[field] > b[field]) return direction === "asc" ? 1 : -1; // @ts-ignore
+ if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
+ // @ts-ignore
+ if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
return 0;
});
};
diff --git a/frontend/next.config.mjs b/frontend/next.config.mjs
index 4678774..ee2db8a 100644
--- a/frontend/next.config.mjs
+++ b/frontend/next.config.mjs
@@ -1,4 +1,9 @@
/** @type {import('next').NextConfig} */
-const nextConfig = {};
+const nextConfig = {
+ env: {
+ NEXT_PUBLIC_API_URL:
+ process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000/api",
+ },
+};
export default nextConfig;
diff --git a/frontend/public/kofi.png b/frontend/public/kofi.png
new file mode 100644
index 0000000..7f6b47b
Binary files /dev/null and b/frontend/public/kofi.png differ
diff --git a/poetry.lock b/poetry.lock
index bf12617..b00709e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "annotated-types"
@@ -189,6 +189,45 @@ tests = ["attrs[tests-no-zope]", "zope-interface"]
tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"]
tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"]
+[[package]]
+name = "azure-core"
+version = "1.30.2"
+description = "Microsoft Azure Core Library for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "azure-core-1.30.2.tar.gz", hash = "sha256:a14dc210efcd608821aa472d9fb8e8d035d29b68993819147bc290a8ac224472"},
+ {file = "azure_core-1.30.2-py3-none-any.whl", hash = "sha256:cf019c1ca832e96274ae85abd3d9f752397194d9fea3b41487290562ac8abe4a"},
+]
+
+[package.dependencies]
+requests = ">=2.21.0"
+six = ">=1.11.0"
+typing-extensions = ">=4.6.0"
+
+[package.extras]
+aio = ["aiohttp (>=3.0)"]
+
+[[package]]
+name = "azure-storage-blob"
+version = "12.20.0"
+description = "Microsoft Azure Blob Storage Client Library for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "azure-storage-blob-12.20.0.tar.gz", hash = "sha256:eeb91256e41d4b5b9bad6a87fd0a8ade07dd58aa52344e2c8d2746e27a017d3b"},
+ {file = "azure_storage_blob-12.20.0-py3-none-any.whl", hash = "sha256:de6b3bf3a90e9341a6bcb96a2ebe981dffff993e9045818f6549afea827a52a9"},
+]
+
+[package.dependencies]
+azure-core = ">=1.28.0"
+cryptography = ">=2.1.4"
+isodate = ">=0.6.1"
+typing-extensions = ">=4.6.0"
+
+[package.extras]
+aio = ["azure-core[aio] (>=1.28.0)"]
+
[[package]]
name = "babel"
version = "2.15.0"
@@ -572,6 +611,60 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1
[package.extras]
toml = ["tomli"]
+[[package]]
+name = "cryptography"
+version = "42.0.8"
+description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "cryptography-42.0.8-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e"},
+ {file = "cryptography-42.0.8-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d"},
+ {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902"},
+ {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801"},
+ {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949"},
+ {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9"},
+ {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583"},
+ {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7"},
+ {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b"},
+ {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7"},
+ {file = "cryptography-42.0.8-cp37-abi3-win32.whl", hash = "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2"},
+ {file = "cryptography-42.0.8-cp37-abi3-win_amd64.whl", hash = "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba"},
+ {file = "cryptography-42.0.8-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28"},
+ {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e"},
+ {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70"},
+ {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c"},
+ {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7"},
+ {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e"},
+ {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961"},
+ {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1"},
+ {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14"},
+ {file = "cryptography-42.0.8-cp39-abi3-win32.whl", hash = "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c"},
+ {file = "cryptography-42.0.8-cp39-abi3-win_amd64.whl", hash = "sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a"},
+ {file = "cryptography-42.0.8-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe"},
+ {file = "cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c"},
+ {file = "cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71"},
+ {file = "cryptography-42.0.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d"},
+ {file = "cryptography-42.0.8-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c"},
+ {file = "cryptography-42.0.8-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842"},
+ {file = "cryptography-42.0.8-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648"},
+ {file = "cryptography-42.0.8-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad"},
+ {file = "cryptography-42.0.8.tar.gz", hash = "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2"},
+]
+
+[package.dependencies]
+cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
+
+[package.extras]
+docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
+docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
+nox = ["nox"]
+pep8test = ["check-sdist", "click", "mypy", "ruff"]
+sdist = ["build"]
+ssh = ["bcrypt (>=3.1.5)"]
+test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
+test-randomorder = ["pytest-randomly"]
+
[[package]]
name = "debugpy"
version = "1.8.1"
@@ -625,6 +718,23 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
+[[package]]
+name = "deprecated"
+version = "1.2.14"
+description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"},
+ {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"},
+]
+
+[package.dependencies]
+wrapt = ">=1.10,<2"
+
+[package.extras]
+dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
+
[[package]]
name = "deptry"
version = "0.12.0"
@@ -1190,6 +1300,20 @@ qtconsole = ["qtconsole"]
test = ["pytest (<7.1)", "pytest-asyncio", "testpath"]
test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.21)", "pandas", "pytest (<7.1)", "pytest-asyncio", "testpath", "trio"]
+[[package]]
+name = "isodate"
+version = "0.6.1"
+description = "An ISO 8601 date/time/duration parser and formatter"
+optional = false
+python-versions = "*"
+files = [
+ {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"},
+ {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"},
+]
+
+[package.dependencies]
+six = "*"
+
[[package]]
name = "isoduration"
version = "20.11.0"
@@ -1530,6 +1654,35 @@ docs = ["autodoc-traits", "jinja2 (<3.2.0)", "mistune (<4)", "myst-parser", "pyd
openapi = ["openapi-core (>=0.18.0,<0.19.0)", "ruamel-yaml"]
test = ["hatch", "ipykernel", "openapi-core (>=0.18.0,<0.19.0)", "openapi-spec-validator (>=0.6.0,<0.8.0)", "pytest (>=7.0,<8)", "pytest-console-scripts", "pytest-cov", "pytest-jupyter[server] (>=0.6.2)", "pytest-timeout", "requests-mock", "ruamel-yaml", "sphinxcontrib-spelling", "strict-rfc3339", "werkzeug"]
+[[package]]
+name = "limits"
+version = "3.12.0"
+description = "Rate limiting utilities"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "limits-3.12.0-py3-none-any.whl", hash = "sha256:48d91e94a0888fb1251aa31423d716ae72ceff997231363f7968a5eaa51dc56d"},
+ {file = "limits-3.12.0.tar.gz", hash = "sha256:95764065715a11b9fdcc82558cac2fb59a1febbb7aa2acd045f72ab0c16ec04f"},
+]
+
+[package.dependencies]
+deprecated = ">=1.2"
+importlib-resources = ">=1.3"
+packaging = ">=21,<25"
+typing-extensions = "*"
+
+[package.extras]
+all = ["aetcd", "coredis (>=3.4.0,<5)", "emcache (>=0.6.1)", "emcache (>=1)", "etcd3", "motor (>=3,<4)", "pymemcache (>3,<5.0.0)", "pymongo (>4.1,<5)", "redis (>3,!=4.5.2,!=4.5.3,<6.0.0)", "redis (>=4.2.0,!=4.5.2,!=4.5.3)"]
+async-etcd = ["aetcd"]
+async-memcached = ["emcache (>=0.6.1)", "emcache (>=1)"]
+async-mongodb = ["motor (>=3,<4)"]
+async-redis = ["coredis (>=3.4.0,<5)"]
+etcd = ["etcd3"]
+memcached = ["pymemcache (>3,<5.0.0)"]
+mongodb = ["pymongo (>4.1,<5)"]
+redis = ["redis (>3,!=4.5.2,!=4.5.3,<6.0.0)"]
+rediscluster = ["redis (>=4.2.0,!=4.5.2,!=4.5.3)"]
+
[[package]]
name = "lxml"
version = "5.2.2"
@@ -3861,6 +4014,23 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
+[[package]]
+name = "slowapi"
+version = "0.1.9"
+description = "A rate limiting extension for Starlette and Fastapi"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+ {file = "slowapi-0.1.9-py3-none-any.whl", hash = "sha256:cfad116cfb84ad9d763ee155c1e5c5cbf00b0d47399a769b227865f5df576e36"},
+ {file = "slowapi-0.1.9.tar.gz", hash = "sha256:639192d0f1ca01b1c6d95bf6c71d794c3a9ee189855337b4821f7f457dddad77"},
+]
+
+[package.dependencies]
+limits = ">=2.3"
+
+[package.extras]
+redis = ["redis (>=3.4.1,<4.0.0)"]
+
[[package]]
name = "sniffio"
version = "1.3.1"
@@ -4876,6 +5046,85 @@ files = [
[package.extras]
test = ["pytest (>=6.0.0)", "setuptools (>=65)"]
+[[package]]
+name = "wrapt"
+version = "1.16.0"
+description = "Module for decorators, wrappers and monkey patching."
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"},
+ {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"},
+ {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"},
+ {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"},
+ {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"},
+ {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"},
+ {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"},
+ {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"},
+ {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"},
+ {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"},
+ {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"},
+ {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"},
+ {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"},
+ {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"},
+ {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"},
+ {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"},
+ {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"},
+ {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"},
+ {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"},
+ {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"},
+ {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"},
+ {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"},
+ {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"},
+ {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"},
+ {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"},
+ {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"},
+ {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"},
+ {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"},
+ {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"},
+ {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"},
+ {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"},
+ {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"},
+ {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"},
+ {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"},
+ {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"},
+ {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"},
+ {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"},
+ {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"},
+ {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"},
+ {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"},
+ {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"},
+ {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"},
+ {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"},
+ {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"},
+ {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"},
+ {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"},
+ {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"},
+ {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"},
+ {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"},
+ {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"},
+ {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"},
+ {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"},
+ {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"},
+ {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"},
+ {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"},
+ {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"},
+ {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"},
+ {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"},
+ {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"},
+ {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"},
+ {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"},
+ {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"},
+ {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"},
+ {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"},
+ {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"},
+ {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"},
+ {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"},
+ {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"},
+ {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"},
+ {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"},
+]
+
[[package]]
name = "zipp"
version = "3.19.2"
@@ -4894,4 +5143,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata]
lock-version = "2.0"
python-versions = ">=3.8,<4.0"
-content-hash = "94b4710b0b5d989f54293ad80f03d6fd811a76b27f1d9ab6253b36971f769f58"
+content-hash = "37d93dd135cc2322805eca294629bfc4414851c85c813caf127736c17be30067"
diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py
index eb4870c..5e2ad64 100644
--- a/pypi_scout/api/main.py
+++ b/pypi_scout/api/main.py
@@ -1,10 +1,15 @@
import logging
+import polars as pl
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
+from slowapi import Limiter, _rate_limit_exceeded_handler
+from slowapi.errors import RateLimitExceeded
+from slowapi.util import get_remote_address
+from starlette.requests import Request
from pypi_scout.api.utils import load_dataset
from pypi_scout.config import Config
@@ -12,28 +17,31 @@
from pypi_scout.utils.score_calculator import calculate_score
from pypi_scout.vector_database import VectorDatabaseInterface
+# Setup logging
setup_logging()
logging.info("Initializing backend...")
+# Initialize limiter
+limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
+app.state.limiter = limiter
+app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
+# Load environment variables and configuration
load_dotenv()
config = Config()
-origins = [
- "http://localhost:3000",
-]
-
+# Add CORS middleware
app.add_middleware(
CORSMiddleware,
- allow_origins=origins,
+ allow_origins=["*"], # Temporary wildcard for testing
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
-df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
-
+# Load dataset and initialize model and vector database interface
+df = load_dataset(config)
model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)
vector_database_interface = VectorDatabaseInterface(
@@ -58,39 +66,49 @@ class Match(BaseModel):
class SearchResponse(BaseModel):
matches: list[Match]
+ warning: bool = False
+ warning_message: str | None = None
-@app.post("/search/", response_model=SearchResponse)
-async def search(query: QueryModel):
+@app.post("/api/search", response_model=SearchResponse)
+@limiter.limit("4/minute")
+async def search(query: QueryModel, request: Request):
"""
Search for the packages whose summary and description have the highest similarity to the query.
We take the top_k * 2 most similar packages, and then calculate weighted score based on the similarity and weekly downloads.
The top_k packages with the highest score are returned.
"""
+ if query.top_k > 60:
+ raise HTTPException(status_code=400, detail="top_k cannot be larger than 60.")
+
logging.info(f"Searching for similar projects. Query: '{query.query}'")
df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
df_matches = df_matches.join(df, how="left", on="name")
-
- if df_matches["weekly_downloads"].is_null().any():
- logging.error(
- "One or more entries have 'None' for 'weekly_downloads'. "
- "This means they were found in the vector database but not in the local dataset."
- )
- logging.error(
- "The most likely cause is that the local dataset was generated with a lower config.FRAC_DATA_TO_INCLUDE "
- "value than the vector database."
- )
- logging.error("To solve this, delete the Pinecone index and rerun the setup script.")
- raise HTTPException(status_code=400, detail="One or more entries have 'None' for 'weekly_downloads'.")
-
logging.info(
f"Fetched the {len(df_matches)} most similar projects. Calculating the weighted scores and filtering..."
)
+
+ warning = False
+ warning_message = ""
+ matches_missing_in_local_dataset = df_matches.filter(pl.col("weekly_downloads").is_null())["name"].to_list()
+ if matches_missing_in_local_dataset:
+ warning = True
+ warning_message = (
+ f"The following entries have 'None' for 'weekly_downloads': {matches_missing_in_local_dataset}. "
+ "These entries were found in the vector database but not in the local dataset and have been excluded from the results."
+ )
+ logging.error(warning_message)
+ df_matches = df_matches.filter(~pl.col("name").is_in(matches_missing_in_local_dataset))
+
df_matches = calculate_score(
df_matches, weight_similarity=config.WEIGHT_SIMILARITY, weight_weekly_downloads=config.WEIGHT_WEEKLY_DOWNLOADS
)
df_matches = df_matches.sort("score", descending=True)
- df_matches = df_matches.head(query.top_k)
+
+ if len(df_matches) > query.top_k:
+ df_matches = df_matches.head(query.top_k)
+
logging.info(f"Returning the {len(df_matches)} best matches.")
- return SearchResponse(matches=df_matches.to_dicts())
+
+ return SearchResponse(matches=df_matches.to_dicts(), warning=warning, warning_message=warning_message)
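For reference, a minimal standalone sketch of the slowapi pattern introduced above (an assumed example, not part of this repo): the limiter is keyed on the client IP, registered on `app.state`, and the decorated endpoint must accept a `Request` argument so slowapi can resolve the caller.

```python
from fastapi import FastAPI
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address
from starlette.requests import Request

# Key requests by remote address so the "4/minute" budget applies per client IP.
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)


@app.get("/ping")  # hypothetical endpoint, for illustration only
@limiter.limit("4/minute")
async def ping(request: Request):  # slowapi requires the Request parameter
    return {"status": "ok"}
```

Exceeding the limit produces a 429 response, which is what the `search.ts` change above maps to the "Rate limit reached" message.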
diff --git a/pypi_scout/api/utils.py b/pypi_scout/api/utils.py
index cab3e58..efa70a4 100644
--- a/pypi_scout/api/utils.py
+++ b/pypi_scout/api/utils.py
@@ -1,12 +1,37 @@
import logging
-from pathlib import Path
+import sys
import polars as pl
+from pypi_scout.config import Config, StorageBackend
+from pypi_scout.utils.blob_io import BlobIO
+
+
+def load_dataset(config: Config) -> pl.DataFrame:
+ dataset_path = config.DATA_DIR / config.DATASET_FOR_API_CSV_NAME
+
+ if dataset_path.exists():
+ logging.info(f"Found local dataset. Reading dataset from `{dataset_path}`...")
+ df = pl.read_csv(dataset_path)
+
+ elif config.STORAGE_BACKEND == StorageBackend.BLOB:
+ logging.info(
+ f"Downloading `{config.DATASET_FOR_API_CSV_NAME}` from container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..."
+ )
+ blob_io = BlobIO(
+ config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
+ config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
+ config.STORAGE_BACKEND_BLOB_KEY,
+ )
+ df = blob_io.download_csv(config.DATASET_FOR_API_CSV_NAME)
+ logging.info("Finished downloading.")
+
+ else:
+ logging.error(
+ f"Dataset {dataset_path} not found, and config.StorageBackend is not `BLOB` so can't download the dataset from Azure. Terminating."
+ )
+ sys.exit(1)
-def load_dataset(path_to_dataset: Path):
- logging.info("Loading the processed dataset...")
- df = pl.read_csv(path_to_dataset)
logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}")
logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")
diff --git a/pypi_scout/config.py b/pypi_scout/config.py
index e4d3a48..671c054 100644
--- a/pypi_scout/config.py
+++ b/pypi_scout/config.py
@@ -1,8 +1,14 @@
import os
from dataclasses import dataclass, field
+from enum import Enum
from pathlib import Path
+class StorageBackend(Enum):
+ LOCAL = "LOCAL"
+ BLOB = "BLOB"
+
+
@dataclass
class Config:
# Name of the Pinecone index used for storing vector representations of the package descriptions.
@@ -21,6 +27,9 @@ class Config:
# Dimension of the vector embeddings produced by the model. Should match the output of the model above.
EMBEDDINGS_DIMENSION = 768
+ # Whether to overwrite existing files, e.g. re-download the raw dataset, re-upload the processed dataset to blob storage, etc.
+ OVERWRITE: bool = True
+
# Directory where dataset files are stored.
DATA_DIR: Path = Path("data")
@@ -30,6 +39,10 @@ class Config:
# Filename for the processed dataset CSV.
PROCESSED_DATASET_CSV_NAME = "processed_dataset.csv"
+ # Filename for the dataset that contains the minimal data that the API needs.
+ # For example, it needs the name, weekly downloads, and summary, but not the (cleaned) description.
+ DATASET_FOR_API_CSV_NAME = "dataset_for_api.csv"
+
# Google Drive file ID for downloading the raw dataset.
GOOGLE_FILE_ID = "1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq"
@@ -44,9 +57,33 @@ class Config:
# Weights for the combined score calculation. Higher WEIGHT_SIMILARITY prioritizes
# relevance based on text similarity, while higher WEIGHT_WEEKLY_DOWNLOADS prioritizes
# packages with more weekly downloads.
- WEIGHT_SIMILARITY = 0.8
- WEIGHT_WEEKLY_DOWNLOADS = 0.2
+ WEIGHT_SIMILARITY = 0.6
+ WEIGHT_WEEKLY_DOWNLOADS = 0.4
+
+ # Storage backend configuration. Can be either StorageBackend.LOCAL or StorageBackend.BLOB.
+ # If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API
+ # will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB,
+ # the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables.
+ STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL
+ STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None
+ STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None
+ STORAGE_BACKEND_BLOB_KEY: str | None = None
def __post_init__(self) -> None:
if not self.PINECONE_TOKEN:
raise OSError("PINECONE_TOKEN not found in environment variables") # noqa: TRY003
+
+ if os.getenv("STORAGE_BACKEND") == "BLOB":
+ self.STORAGE_BACKEND = StorageBackend.BLOB
+ self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME = os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME")
+ self.STORAGE_BACKEND_BLOB_CONTAINER_NAME = os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME")
+ self.STORAGE_BACKEND_BLOB_KEY = os.getenv("STORAGE_BACKEND_BLOB_KEY")
+
+ if not all(
+ [
+ self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
+ self.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
+ self.STORAGE_BACKEND_BLOB_KEY,
+ ]
+ ):
+ raise OSError("One or more BLOB storage environment variables are missing!") # noqa: TRY003
diff --git a/pypi_scout/data/reader.py b/pypi_scout/data/raw_data_reader.py
similarity index 97%
rename from pypi_scout/data/reader.py
rename to pypi_scout/data/raw_data_reader.py
index 4a31e9f..f0de2ba 100644
--- a/pypi_scout/data/reader.py
+++ b/pypi_scout/data/raw_data_reader.py
@@ -5,7 +5,7 @@
@dataclass
-class DataReader:
+class RawDataReader:
"""
A class for reading and processing data from a raw PyPI dataset.
"""
diff --git a/pypi_scout/scripts/download_dataset.py b/pypi_scout/scripts/download_raw_dataset.py
similarity index 62%
rename from pypi_scout/scripts/download_dataset.py
rename to pypi_scout/scripts/download_raw_dataset.py
index 58a0c94..e10a81f 100644
--- a/pypi_scout/scripts/download_dataset.py
+++ b/pypi_scout/scripts/download_raw_dataset.py
@@ -7,7 +7,7 @@
from pypi_scout.utils.logging import setup_logging
-def download_dataset():
+def download_raw_dataset():
"""
Downloads the dataset from a Google Drive link using the gdown library.
"""
@@ -16,8 +16,13 @@ def download_dataset():
target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
if target_path.exists():
- logging.info(f"โ๏ธ Raw dataset {target_path} from Google Drive already exists! Skipping download.")
- return
+ if not config.OVERWRITE:
+ logging.info(f"๐น Raw dataset {target_path} from Google Drive already exists! Skipping download.")
+ return
+ else:
+ logging.info(
+ f"โคต๏ธ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..."
+ )
logging.info(f"โฌ๏ธ Downloading raw dataset from Google Drive to {target_path}...")
url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
@@ -27,4 +32,4 @@ def download_dataset():
if __name__ == "__main__":
setup_logging()
- download_dataset()
+ download_raw_dataset()
diff --git a/pypi_scout/scripts/process_dataset.py b/pypi_scout/scripts/process_raw_dataset.py
similarity index 79%
rename from pypi_scout/scripts/process_dataset.py
rename to pypi_scout/scripts/process_raw_dataset.py
index 8bc0473..ca68d0f 100644
--- a/pypi_scout/scripts/process_dataset.py
+++ b/pypi_scout/scripts/process_raw_dataset.py
@@ -5,14 +5,14 @@
from pypi_scout.config import Config
from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_scout.data.reader import DataReader
+from pypi_scout.data.raw_data_reader import RawDataReader
from pypi_scout.utils.logging import setup_logging
def read_raw_dataset(path_to_raw_dataset):
logging.info("๐ Reading the raw dataset...")
- df = DataReader(path_to_raw_dataset).read()
- logging.info("๐ Number of rows in the raw dataset: %s", len(df))
+ df = RawDataReader(path_to_raw_dataset).read()
+ logging.info(f"๐ Number of rows in the raw dataset: {len(df):,}")
logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")
return df
@@ -42,22 +42,24 @@ def clean_descriptions(df):
return df
-def store_processed_dataset(df, processed_dataset_path):
- logging.info("Storing the processed dataset...")
+def write_csv(df, processed_dataset_path):
+ logging.info(f"Storing dataset in {processed_dataset_path}...")
df.write_csv(processed_dataset_path)
logging.info("โ
Done!")
-def process_dataset():
+def process_raw_dataset():
load_dotenv()
config = Config()
df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
if config.FRAC_DATA_TO_INCLUDE < 1.0:
df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
df = clean_descriptions(df)
- store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+
+ write_csv(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+ write_csv(df.select(["name", "summary", "weekly_downloads"]), config.DATA_DIR / config.DATASET_FOR_API_CSV_NAME)
if __name__ == "__main__":
setup_logging()
- process_dataset()
+ process_raw_dataset()
diff --git a/pypi_scout/scripts/setup.py b/pypi_scout/scripts/setup.py
index 680b5ed..795e201 100644
--- a/pypi_scout/scripts/setup.py
+++ b/pypi_scout/scripts/setup.py
@@ -1,18 +1,30 @@
import argparse
+import logging
-from pypi_scout.scripts.download_dataset import download_dataset
-from pypi_scout.scripts.process_dataset import process_dataset
+from pypi_scout.scripts.download_raw_dataset import download_raw_dataset
+from pypi_scout.scripts.process_raw_dataset import process_raw_dataset
from pypi_scout.scripts.setup_pinecone import setup_pinecone
+from pypi_scout.scripts.upload_processed_datasets import upload_processed_datasets
from pypi_scout.scripts.upsert_data import upsert_data
from pypi_scout.utils.logging import setup_logging
def main(no_upsert):
setup_logging()
+
+ logging.info("\n\nSETTING UP PINECONE -------------\n")
setup_pinecone()
- download_dataset()
- process_dataset()
+
+ logging.info("\n\nDOWNLOADING RAW DATASET -------------\n")
+ download_raw_dataset()
+
+ logging.info("\n\nPROCESSING RAW DATASET -------------\n")
+ process_raw_dataset()
+
+ logging.info("\n\nUPLOADING PROCESSED DATASETS -------------\n")
+ upload_processed_datasets()
if not no_upsert:
+ logging.info("\n\nUPSERTING DATA TO PINECONE -------------\n")
upsert_data()
diff --git a/pypi_scout/scripts/setup_pinecone.py b/pypi_scout/scripts/setup_pinecone.py
index c3d12f6..4f3b1f2 100644
--- a/pypi_scout/scripts/setup_pinecone.py
+++ b/pypi_scout/scripts/setup_pinecone.py
@@ -33,7 +33,7 @@ def setup_pinecone():
logging.info("โ
Pinecone index created successfully.")
except PineconeApiException as e:
if e.status == 409:
- logging.warning(f"โ๏ธ Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
+ logging.warning(f"๐น Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
else:
logging.exception("โ An error occurred while creating the Pinecone index.")
diff --git a/pypi_scout/scripts/upload_processed_datasets.py b/pypi_scout/scripts/upload_processed_datasets.py
new file mode 100644
index 0000000..4d94043
--- /dev/null
+++ b/pypi_scout/scripts/upload_processed_datasets.py
@@ -0,0 +1,69 @@
+import logging
+from pathlib import Path
+
+import polars as pl
+from dotenv import load_dotenv
+
+from pypi_scout.config import Config, StorageBackend
+from pypi_scout.utils.blob_io import BlobIO
+from pypi_scout.utils.logging import setup_logging
+
+
+class CsvToBlobUploader:
+ def __init__(self, config: Config):
+ self.config = config
+ self.blob_io = BlobIO(
+ config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME,
+ config.STORAGE_BACKEND_BLOB_CONTAINER_NAME,
+ config.STORAGE_BACKEND_BLOB_KEY,
+ )
+ self.overwrite = config.OVERWRITE
+
+ def read_csv(self, path_to_csv: Path) -> pl.DataFrame:
+ logging.info(f"๐ Reading the dataset from {path_to_csv}...")
+ df = pl.read_csv(path_to_csv)
+ logging.info(f"๐ Number of rows in the dataset {path_to_csv.name}: {len(df):,}")
+ return df
+
+ def upload_csv_to_blob(self, df: pl.DataFrame, csv_name: str):
+ if self.blob_io.exists(csv_name):
+ if not self.overwrite:
+ logging.info(
+ f"๐น Dataset {csv_name} already exists in container '{self.config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping upload."
+ )
+ return
+ else:
+ logging.info(
+ f"โคต๏ธ Dataset {csv_name} already exists in container '{self.config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}', but overwrite is enabled. Overwriting..."
+ )
+
+ logging.info(f"Uploading {csv_name} to blob container {self.config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}...")
+ self.blob_io.upload_csv(df, csv_name)
+ logging.info("โ
Done!")
+
+ def process_and_upload_datasets(self, dataset_names: list[str]):
+ for dataset_name in dataset_names:
+ csv_path = self.config.DATA_DIR / dataset_name
+ df = self.read_csv(csv_path)
+ self.upload_csv_to_blob(df, dataset_name)
+
+
+def upload_processed_datasets():
+ load_dotenv()
+ config = Config()
+
+ if config.STORAGE_BACKEND != StorageBackend.BLOB:
+ logging.info(
+ "Not using BLOB backend. Skipping upload. To enable, configure the `STORAGE_BACKEND_` variables in config"
+ )
+ return
+
+ dataset_names = [config.PROCESSED_DATASET_CSV_NAME, config.DATASET_FOR_API_CSV_NAME]
+
+ uploader = CsvToBlobUploader(config)
+ uploader.process_and_upload_datasets(dataset_names)
+
+
+if __name__ == "__main__":
+ setup_logging()
+ upload_processed_datasets()
diff --git a/pypi_scout/utils/blob_io.py b/pypi_scout/utils/blob_io.py
new file mode 100644
index 0000000..74eaa5e
--- /dev/null
+++ b/pypi_scout/utils/blob_io.py
@@ -0,0 +1,47 @@
+import tempfile
+from io import BytesIO
+
+import polars as pl
+from azure.storage.blob import BlobServiceClient
+
+
+class BlobIO:
+ def __init__(self, account_name: str, container_name: str, account_key: str):
+ self.account_name = account_name
+ self.container_name = container_name
+ self.account_key = account_key
+ self.service_client = BlobServiceClient(
+ account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key
+ )
+ self.container_client = self.service_client.get_container_client(container_name)
+
+ def upload_csv(self, data_frame: pl.DataFrame, blob_name: str) -> None:
+ csv_buffer = BytesIO()
+ data_frame.write_csv(csv_buffer)
+ csv_buffer.seek(0) # Reset buffer position to the beginning
+ blob_client = self.container_client.get_blob_client(blob_name)
+ blob_client.upload_blob(csv_buffer, overwrite=True)
+
+ def upload_local_csv(self, local_file_path: str, blob_name: str) -> None:
+ with open(local_file_path, "rb") as data:
+ blob_client = self.container_client.get_blob_client(blob_name)
+ blob_client.upload_blob(data, overwrite=True)
+
+ def download_csv(self, blob_name: str) -> pl.DataFrame:
+ blob_client = self.container_client.get_blob_client(blob_name)
+ download_stream = blob_client.download_blob()
+
+ # Create a temporary file
+ with tempfile.NamedTemporaryFile(delete=True) as temp_file:
+ # Download the blob content into the temporary file
+ temp_file.write(download_stream.readall())
+ temp_file.flush()
+
+ # Read the CSV using Polars
+ df = pl.read_csv(temp_file.name)
+
+ return df
+
+ def exists(self, blob_name):
+ blob_client = self.container_client.get_blob_client(blob_name)
+ return blob_client.exists()
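A hedged round-trip sketch for the `BlobIO` helper defined above; the account, container, and key are placeholders, and a real storage account is needed to run it.

```python
import polars as pl

from pypi_scout.utils.blob_io import BlobIO

blob_io = BlobIO("<storage-account>", "<container>", "<account-key>")

# Upload a small frame, check it landed, and read it back.
df = pl.DataFrame({"name": ["polars"], "weekly_downloads": [1_000_000]})
blob_io.upload_csv(df, "dataset_for_api.csv")
assert blob_io.exists("dataset_for_api.csv")
round_trip = blob_io.download_csv("dataset_for_api.csv")
```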
diff --git a/pypi_scout/utils/logging.py b/pypi_scout/utils/logging.py
index 2498547..997cace 100644
--- a/pypi_scout/utils/logging.py
+++ b/pypi_scout/utils/logging.py
@@ -2,6 +2,8 @@
def setup_logging() -> None:
+ logging.getLogger("azure").setLevel(logging.WARNING)
+
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
diff --git a/pypi_scout/vector_database/interface.py b/pypi_scout/vector_database/interface.py
index fbadff1..3293231 100644
--- a/pypi_scout/vector_database/interface.py
+++ b/pypi_scout/vector_database/interface.py
@@ -32,6 +32,7 @@ def __init__(
pc = Pinecone(api_key=pinecone_token)
self.index = pc.Index(pinecone_index_name)
self.pinecone_namespace = pinecone_namespace
+ logging.info("Connection successful.")
def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
"""
diff --git a/pyproject.toml b/pyproject.toml
index de2e2fe..aecb24e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,9 @@ fastapi = "^0.111.0"
pydantic = "^2.7.4"
uvicorn = "^0.30.1"
gdown = "^5.2.0"
+azure-storage-blob = "^12.20.0"
+slowapi = "^0.1.9"
+starlette = "^0.37.2"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.0"
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 26efca3..2c10e0e 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -12,5 +12,9 @@ pydantic==2.7.4
uvicorn==0.30.1
gdown==5.2.0
torch==2.0.1
+numpy==1.24.4
+azure-storage-blob==12.20.0
+slowapi==0.1.9
+starlette==0.37.2
--index-url=https://download.pytorch.org/whl/cpu
--extra-index-url=https://pypi.org/simple
diff --git a/start.sh b/start.sh
deleted file mode 100755
index a8cfd13..0000000
--- a/start.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-set -e
-python pypi_scout/scripts/setup.py --no-upsert
-uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000