From 6ab0f89f7395097fcbe3a8f191c9144d990670be Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Tue, 18 Jun 2024 10:03:28 +0200 Subject: [PATCH 01/22] try aks --- frontend/app/utils/search.ts | 9 +++++---- frontend/next.config.mjs | 6 +++++- pypi_scout/api/main.py | 1 + 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts index 56c652a..d99e8f0 100644 --- a/frontend/app/utils/search.ts +++ b/frontend/app/utils/search.ts @@ -7,6 +7,8 @@ interface Match { summary: string; } +const apiUrl = process.env.API_URL; + export const handleSearch = async ( query: string, sortField: string, @@ -19,7 +21,7 @@ export const handleSearch = async ( setError(""); try { const response = await axios.post( - "http://localhost:8000/search", + `${apiUrl}/search`, { query: query, }, @@ -45,9 +47,8 @@ export const sortResults = ( direction: string, ): Match[] => { return [...data].sort((a, b) => { - // @ts-ignore - if (a[field] < b[field]) return direction === "asc" ? -1 : 1; // @ts-ignore - if (a[field] > b[field]) return direction === "asc" ? 1 : -1; // @ts-ignore + if (a[field] < b[field]) return direction === "asc" ? -1 : 1; + if (a[field] > b[field]) return direction === "asc" ? 1 : -1; return 0; }); }; diff --git a/frontend/next.config.mjs b/frontend/next.config.mjs index 4678774..a28490c 100644 --- a/frontend/next.config.mjs +++ b/frontend/next.config.mjs @@ -1,4 +1,8 @@ /** @type {import('next').NextConfig} */ -const nextConfig = {}; +const nextConfig = { + env: { + API_URL: process.env.API_URL || "http://localhost:8000", + }, +}; export default nextConfig; diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py index eb4870c..2876b89 100644 --- a/pypi_scout/api/main.py +++ b/pypi_scout/api/main.py @@ -22,6 +22,7 @@ origins = [ "http://localhost:3000", + "http://frontend-service:3000", ] app.add_middleware( From 78997cc0e89dba1ab4ff6bd286cd26fe34ac6d8d Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Tue, 18 Jun 2024 10:08:40 +0200 Subject: [PATCH 02/22] fix --- frontend/app/utils/search.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts index d99e8f0..4077460 100644 --- a/frontend/app/utils/search.ts +++ b/frontend/app/utils/search.ts @@ -47,8 +47,9 @@ export const sortResults = ( direction: string, ): Match[] => { return [...data].sort((a, b) => { - if (a[field] < b[field]) return direction === "asc" ? -1 : 1; - if (a[field] > b[field]) return direction === "asc" ? 1 : -1; + // @ts-ignore + if (a[field] < b[field]) return direction === "asc" ? -1 : 1; // @ts-ignore + if (a[field] > b[field]) return direction === "asc" ? 
1 : -1; // @ts-ignore return 0; }); }; From 526a6674b9ba1d1b894335fcbf232c4723e3f92b Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Tue, 18 Jun 2024 11:03:49 +0200 Subject: [PATCH 03/22] next public api url --- frontend/app/utils/search.ts | 2 +- frontend/next.config.mjs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts index 4077460..57f70cd 100644 --- a/frontend/app/utils/search.ts +++ b/frontend/app/utils/search.ts @@ -7,7 +7,7 @@ interface Match { summary: string; } -const apiUrl = process.env.API_URL; +const apiUrl = process.env.NEXT_PUBLIC_API_URL; export const handleSearch = async ( query: string, diff --git a/frontend/next.config.mjs b/frontend/next.config.mjs index a28490c..38cb6a6 100644 --- a/frontend/next.config.mjs +++ b/frontend/next.config.mjs @@ -1,7 +1,8 @@ /** @type {import('next').NextConfig} */ const nextConfig = { env: { - API_URL: process.env.API_URL || "http://localhost:8000", + NEXT_PUBLIC_API_URL: + process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000", }, }; From 7a6f91df4b8569a00631949902a7fcaaf9ea3437 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Tue, 18 Jun 2024 22:18:11 +0200 Subject: [PATCH 04/22] build env var for api url --- .github/workflows/publish.yml | 2 ++ docker-compose.yml | 2 ++ frontend/.dockerignore | 5 +++++ frontend/Dockerfile | 6 ++++++ 4 files changed, 15 insertions(+) create mode 100644 frontend/.dockerignore diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 76b5b7e..894cd6c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -37,3 +37,5 @@ jobs: platforms: linux/amd64 push: true tags: pypiscoutacr.azurecr.io/pypi-scout-frontend:latest + build-args: | + NEXT_PUBLIC_API_URL=http://backend-service:8000 diff --git a/docker-compose.yml b/docker-compose.yml index f106db6..36c2a13 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,8 @@ services: build: context: ./frontend dockerfile: Dockerfile + args: + NEXT_PUBLIC_API_URL: http://localhost:8000 ports: - "3000:3000" environment: diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..8fd9192 --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,5 @@ +# .dockerignore +node_modules +.next +.env +.git diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 290190a..ac7d507 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -13,6 +13,12 @@ RUN npm install # Copy the rest of the application code to the container COPY . . +# Build argument to accept the API URL during build time +ARG NEXT_PUBLIC_API_URL + +# Set environment variable within the container +ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL} + # Build the Next.js application RUN npm run build From 300aea56d117c6a8342a665410090722899c5c76 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Wed, 19 Jun 2024 16:40:43 +0200 Subject: [PATCH 05/22] temporary all origins --- docker-compose.yml | 3 --- pypi_scout/api/main.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 36c2a13..e5ed2fb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,6 @@ services: build: context: . 
dockerfile: Dockerfile - working_dir: / command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 ports: - "8000:8000" @@ -22,7 +21,5 @@ services: NEXT_PUBLIC_API_URL: http://localhost:8000 ports: - "3000:3000" - environment: - - NODE_ENV=production depends_on: - backend diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py index 2876b89..66da31e 100644 --- a/pypi_scout/api/main.py +++ b/pypi_scout/api/main.py @@ -27,7 +27,7 @@ app.add_middleware( CORSMiddleware, - allow_origins=origins, + allow_origins=["*"], # Temporary wildcard for testing allow_credentials=True, allow_methods=["*"], allow_headers=["*"], From 8371522176a39a391b8afabfbca9dd426e275080 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Wed, 19 Jun 2024 21:51:00 +0200 Subject: [PATCH 06/22] nginx-service api --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 894cd6c..687b47f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -38,4 +38,4 @@ jobs: push: true tags: pypiscoutacr.azurecr.io/pypi-scout-frontend:latest build-args: | - NEXT_PUBLIC_API_URL=http://backend-service:8000 + NEXT_PUBLIC_API_URL=http://nginx-service/api From b69a5666085ad0bf7c2bcc0ec0880b339947fad3 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Thu, 20 Jun 2024 16:04:15 +0200 Subject: [PATCH 07/22] add host to frontend --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 687b47f..31ad085 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -38,4 +38,4 @@ jobs: push: true tags: pypiscoutacr.azurecr.io/pypi-scout-frontend:latest build-args: | - NEXT_PUBLIC_API_URL=http://nginx-service/api + NEXT_PUBLIC_API_URL=${{ env.HOST }}/api From 9aea52e71d05d9c5c2fc48985a65d11ff6f0f955 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Thu, 20 Jun 2024 20:01:29 +0200 Subject: [PATCH 08/22] add api to url --- frontend/next.config.mjs | 2 +- pypi_scout/api/main.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/frontend/next.config.mjs b/frontend/next.config.mjs index 38cb6a6..ee2db8a 100644 --- a/frontend/next.config.mjs +++ b/frontend/next.config.mjs @@ -2,7 +2,7 @@ const nextConfig = { env: { NEXT_PUBLIC_API_URL: - process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000", + process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000/api", }, }; diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py index 66da31e..4f2bb04 100644 --- a/pypi_scout/api/main.py +++ b/pypi_scout/api/main.py @@ -20,11 +20,6 @@ load_dotenv() config = Config() -origins = [ - "http://localhost:3000", - "http://frontend-service:3000", -] - app.add_middleware( CORSMiddleware, allow_origins=["*"], # Temporary wildcard for testing @@ -61,7 +56,7 @@ class SearchResponse(BaseModel): matches: list[Match] -@app.post("/search/", response_model=SearchResponse) +@app.post("/api/search/", response_model=SearchResponse) async def search(query: QueryModel): """ Search for the packages whose summary and description have the highest similarity to the query. 
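With this change the search route lives at /api/search/ and the frontend's default base URL gains the matching /api suffix, so the same path works whether requests go straight to the backend or through a proxy placed in front of it. A minimal sketch for exercising the relocated endpoint outside the Next.js app, assuming the backend is reachable on localhost:8000 and that the requests package is available (it is not one of this project's declared dependencies):

    import requests

    # Hedged example: the payload mirrors QueryModel ({"query": ...}) and the
    # response mirrors SearchResponse, whose matches carry name, similarity,
    # weekly_downloads and summary fields.
    response = requests.post(
        "http://localhost:8000/api/search/",
        json={"query": "package for reading and writing parquet files"},
        timeout=30,
    )
    response.raise_for_status()
    for match in response.json()["matches"]:
        print(f'{match["name"]}: similarity {match["similarity"]:.3f}, '
              f'{match["weekly_downloads"]:,} weekly downloads')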
From 2b95f7d34eda19935f92112454c6e79fb376ef30 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Thu, 20 Jun 2024 20:09:19 +0200 Subject: [PATCH 09/22] add numpy --- requirements-cpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 26efca3..7df01fe 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -12,5 +12,6 @@ pydantic==2.7.4 uvicorn==0.30.1 gdown==5.2.0 torch==2.0.1 +numpy==1.24.4 --index-url=https://download.pytorch.org/whl/cpu --extra-index-url=https://pypi.org/simple From d9cc11efc7e01355fde87c25ae0284046033d419 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Thu, 20 Jun 2024 20:13:30 +0200 Subject: [PATCH 10/22] build and push parallel --- .github/workflows/publish.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 31ad085..991882e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: jobs: - push-to-acr: + build-and-push-backend: runs-on: ubuntu-latest steps: - name: Checkout repository @@ -29,6 +29,22 @@ jobs: push: true tags: pypiscoutacr.azurecr.io/pypi-scout-backend:latest + build-and-push-frontend: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to Azure Container Registry + uses: azure/docker-login@v1 + with: + login-server: pypiscoutacr.azurecr.io + username: ${{ secrets.ACR_USERNAME }} + password: ${{ secrets.ACR_PASSWORD }} + - name: Build and Push Frontend Docker image uses: docker/build-push-action@v4 with: From d83bb6ac68b492567f5666bc2074003a67644994 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Fri, 21 Jun 2024 12:45:30 +0200 Subject: [PATCH 11/22] wip; download blob backend --- poetry.lock | 111 ++++++++++++++++++++++++- pypi_scout/config.py | 27 ++++++ pypi_scout/scripts/download_dataset.py | 33 +++++++- pypi_scout/scripts/process_dataset.py | 53 ++++++++++-- pypi_scout/utils/blob_io.py | 38 +++++++++ pypi_scout/utils/logging.py | 2 + pyproject.toml | 1 + 7 files changed, 256 insertions(+), 9 deletions(-) create mode 100644 pypi_scout/utils/blob_io.py diff --git a/poetry.lock b/poetry.lock index bf12617..110e780 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -189,6 +189,45 @@ tests = ["attrs[tests-no-zope]", "zope-interface"] tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] +[[package]] +name = "azure-core" +version = "1.30.2" +description = "Microsoft Azure Core Library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "azure-core-1.30.2.tar.gz", hash = "sha256:a14dc210efcd608821aa472d9fb8e8d035d29b68993819147bc290a8ac224472"}, + {file = "azure_core-1.30.2-py3-none-any.whl", hash = "sha256:cf019c1ca832e96274ae85abd3d9f752397194d9fea3b41487290562ac8abe4a"}, +] + +[package.dependencies] +requests = ">=2.21.0" +six = ">=1.11.0" +typing-extensions = ">=4.6.0" + +[package.extras] +aio = ["aiohttp (>=3.0)"] + +[[package]] +name = "azure-storage-blob" +version = "12.20.0" +description = "Microsoft Azure Blob Storage Client Library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "azure-storage-blob-12.20.0.tar.gz", hash = "sha256:eeb91256e41d4b5b9bad6a87fd0a8ade07dd58aa52344e2c8d2746e27a017d3b"}, + {file = "azure_storage_blob-12.20.0-py3-none-any.whl", hash = "sha256:de6b3bf3a90e9341a6bcb96a2ebe981dffff993e9045818f6549afea827a52a9"}, +] + +[package.dependencies] +azure-core = ">=1.28.0" +cryptography = ">=2.1.4" +isodate = ">=0.6.1" +typing-extensions = ">=4.6.0" + +[package.extras] +aio = ["azure-core[aio] (>=1.28.0)"] + [[package]] name = "babel" version = "2.15.0" @@ -572,6 +611,60 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cryptography" +version = "42.0.8" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "cryptography-42.0.8-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e"}, + {file = "cryptography-42.0.8-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949"}, + {file = "cryptography-42.0.8-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b"}, + {file = "cryptography-42.0.8-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7"}, + {file = "cryptography-42.0.8-cp37-abi3-win32.whl", hash = "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2"}, + {file = "cryptography-42.0.8-cp37-abi3-win_amd64.whl", hash = "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba"}, + {file = "cryptography-42.0.8-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c"}, + {file = "cryptography-42.0.8-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1"}, + {file = "cryptography-42.0.8-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14"}, + {file = "cryptography-42.0.8-cp39-abi3-win32.whl", hash = "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c"}, + {file = "cryptography-42.0.8-cp39-abi3-win_amd64.whl", hash = 
"sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71"}, + {file = "cryptography-42.0.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648"}, + {file = "cryptography-42.0.8-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad"}, + {file = "cryptography-42.0.8.tar.gz", hash = "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2"}, +] + +[package.dependencies] +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] +nox = ["nox"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] +sdist = ["build"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test-randomorder = ["pytest-randomly"] + [[package]] name = "debugpy" version = "1.8.1" @@ -1190,6 +1283,20 @@ qtconsole = ["qtconsole"] test = ["pytest (<7.1)", "pytest-asyncio", "testpath"] test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.21)", "pandas", "pytest (<7.1)", "pytest-asyncio", "testpath", "trio"] +[[package]] +name = "isodate" +version = "0.6.1" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = false +python-versions = "*" +files = [ + {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, + {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, +] + +[package.dependencies] +six = "*" + [[package]] name = "isoduration" version = "20.11.0" @@ -4894,4 +5001,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "94b4710b0b5d989f54293ad80f03d6fd811a76b27f1d9ab6253b36971f769f58" +content-hash = "35d15e157357093f2ed35f9cae6ce7e9cb78f2d8592161a4939141ae437bd84a" diff --git a/pypi_scout/config.py b/pypi_scout/config.py index e4d3a48..ed23fca 100644 --- a/pypi_scout/config.py +++ b/pypi_scout/config.py @@ -1,8 +1,14 @@ import os from dataclasses import dataclass, field +from enum import Enum from pathlib import Path +class StorageBackend(Enum): + LOCAL = "LOCAL" + BLOB = "BLOB" + + @dataclass class Config: # Name of the Pinecone index used for storing vector representations of 
the package descriptions. @@ -47,6 +53,27 @@ class Config: WEIGHT_SIMILARITY = 0.8 WEIGHT_WEEKLY_DOWNLOADS = 0.2 + # Storage backend + STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL + STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None + STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None + STORAGE_BACKEND_BLOB_KEY: str | None = None + def __post_init__(self) -> None: if not self.PINECONE_TOKEN: raise OSError("PINECONE_TOKEN not found in environment variables") # noqa: TRY003 + + if os.getenv("STORAGE_BACKEND") == "BLOB": + self.STORAGE_BACKEND = StorageBackend.BLOB + self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME = os.getenv("STORAGE_BACKEND_BLOB_ACCOUNT_NAME") + self.STORAGE_BACKEND_BLOB_CONTAINER_NAME = os.getenv("STORAGE_BACKEND_BLOB_CONTAINER_NAME") + self.STORAGE_BACKEND_BLOB_KEY = os.getenv("STORAGE_BACKEND_BLOB_KEY") + + if not all( + [ + self.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, + self.STORAGE_BACKEND_BLOB_CONTAINER_NAME, + self.STORAGE_BACKEND_BLOB_KEY, + ] + ): + raise OSError("One or more BLOB storage environment variables are missing!") diff --git a/pypi_scout/scripts/download_dataset.py b/pypi_scout/scripts/download_dataset.py index 58a0c94..3f5c667 100644 --- a/pypi_scout/scripts/download_dataset.py +++ b/pypi_scout/scripts/download_dataset.py @@ -3,7 +3,8 @@ import gdown from dotenv import load_dotenv -from pypi_scout.config import Config +from pypi_scout.config import Config, StorageBackend +from pypi_scout.utils.blob_io import BlobIO from pypi_scout.utils.logging import setup_logging @@ -14,6 +15,13 @@ def download_dataset(): load_dotenv() config = Config() + if config.STORAGE_BACKEND == StorageBackend.LOCAL: + handle_for_local_backend(config) + else: + handle_for_blob_backend(config) + + +def handle_for_local_backend(config: Config): target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME if target_path.exists(): logging.info(f"✔️ Raw dataset {target_path} from Google Drive already exists! Skipping download.") @@ -25,6 +33,29 @@ def download_dataset(): logging.info("✅ Done!") +def handle_for_blob_backend(config: Config): + blob_io = BlobIO( + config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, + config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, + config.STORAGE_BACKEND_BLOB_KEY, + ) + + if blob_io.exists(config.RAW_DATASET_CSV_NAME): + logging.info( + f"✔️ Raw dataset {config.RAW_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download." 
+ ) + return + + temp_target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME + logging.info("⬇️ Downloading raw dataset from Google Drive to temporary file...") + url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}" + gdown.download(url, str(temp_target_path), quiet=False) + + logging.info("Downloading done, now uploading to Blob...") + blob_io.upload_local_csv(temp_target_path, config.RAW_DATASET_CSV_NAME) + logging.info("✅ Done!") + + if __name__ == "__main__": setup_logging() download_dataset() diff --git a/pypi_scout/scripts/process_dataset.py b/pypi_scout/scripts/process_dataset.py index 8bc0473..dedc8c0 100644 --- a/pypi_scout/scripts/process_dataset.py +++ b/pypi_scout/scripts/process_dataset.py @@ -1,11 +1,13 @@ import logging +from pathlib import Path import polars as pl from dotenv import load_dotenv -from pypi_scout.config import Config +from pypi_scout.config import Config, StorageBackend from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner from pypi_scout.data.reader import DataReader +from pypi_scout.utils.blob_io import BlobIO from pypi_scout.utils.logging import setup_logging @@ -42,20 +44,59 @@ def clean_descriptions(df): return df -def store_processed_dataset(df, processed_dataset_path): +def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path): logging.info("Storing the processed dataset...") df.write_csv(processed_dataset_path) logging.info("✅ Done!") -def process_dataset(): - load_dotenv() - config = Config() +def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str): + logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...") + blob_io.upload_csv(df, blob_name) + logging.info("✅ Done!") + + +def handle_for_local_backend(config: Config): + if (config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME).exists(): + logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.") + return + df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME) if config.FRAC_DATA_TO_INCLUDE < 1.0: df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE) df = clean_descriptions(df) - store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) + + store_processed_dataset_local(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) + + +def handle_for_blob_backend(config: Config): + blob_io = BlobIO( + config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, + config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, + config.STORAGE_BACKEND_BLOB_KEY, + ) + + if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME): + logging.info( + f"✔️ Raw dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download." 
+ ) + return + + df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME) + if config.FRAC_DATA_TO_INCLUDE < 1.0: + df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE) + df = clean_descriptions(df) + + store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME) + + +def process_dataset(): + load_dotenv() + config = Config() + if config.STORAGE_BACKEND == StorageBackend.LOCAL: + handle_for_local_backend(config) + else: + handle_for_blob_backend(config) if __name__ == "__main__": diff --git a/pypi_scout/utils/blob_io.py b/pypi_scout/utils/blob_io.py new file mode 100644 index 0000000..28d0078 --- /dev/null +++ b/pypi_scout/utils/blob_io.py @@ -0,0 +1,38 @@ +from io import BytesIO + +import polars as pl +from azure.storage.blob import BlobServiceClient + + +class BlobIO: + def __init__(self, account_name: str, container_name: str, account_key: str): + self.account_name = account_name + self.container_name = container_name + self.account_key = account_key + self.service_client = BlobServiceClient( + account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key + ) + self.container_client = self.service_client.get_container_client(container_name) + + def upload_csv(self, data_frame: pl.DataFrame, blob_name: str) -> None: + csv_buffer = BytesIO() + data_frame.write_csv(csv_buffer) + csv_buffer.seek(0) # Reset buffer position to the beginning + blob_client = self.container_client.get_blob_client(blob_name) + blob_client.upload_blob(csv_buffer, overwrite=True) + + def upload_local_csv(self, local_file_path: str, blob_name: str) -> None: + with open(local_file_path, "rb") as data: + blob_client = self.container_client.get_blob_client(blob_name) + blob_client.upload_blob(data, overwrite=True) + + def download_csv(self, blob_name: str) -> pl.DataFrame: + blob_client = self.container_client.get_blob_client(blob_name) + download_stream = blob_client.download_blob() + csv_content = download_stream.content_as_text() + csv_buffer = StringIO(csv_content) + return pl.read_csv(csv_buffer) + + def exists(self, blob_name): + blob_client = self.container_client.get_blob_client(blob_name) + return blob_client.exists() diff --git a/pypi_scout/utils/logging.py b/pypi_scout/utils/logging.py index 2498547..997cace 100644 --- a/pypi_scout/utils/logging.py +++ b/pypi_scout/utils/logging.py @@ -2,6 +2,8 @@ def setup_logging() -> None: + logging.getLogger("azure").setLevel(logging.WARNING) + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", diff --git a/pyproject.toml b/pyproject.toml index de2e2fe..24d29fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ fastapi = "^0.111.0" pydantic = "^2.7.4" uvicorn = "^0.30.1" gdown = "^5.2.0" +azure-storage-blob = "^12.20.0" [tool.poetry.group.dev.dependencies] pytest = "^7.2.0" From d77543b0b065340ee8c01b8310b6e9816ec4b552 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Fri, 21 Jun 2024 15:06:23 +0200 Subject: [PATCH 12/22] remove start script, add blob backend --- .DS_Store | Bin 0 -> 6148 bytes Dockerfile | 6 +- DockerfileCPU | 6 +- docker-compose.yml | 5 +- pypi_scout/api/main.py | 2 +- pypi_scout/api/utils.py | 33 ++++++++-- pypi_scout/config.py | 10 ++- .../data/{reader.py => raw_data_reader.py} | 2 +- pypi_scout/scripts/download_dataset.py | 61 ------------------ pypi_scout/scripts/download_raw_dataset.py | 35 ++++++++++ ...cess_dataset.py => process_raw_dataset.py} | 59 +++-------------- pypi_scout/scripts/setup.py | 20 ++++-- 
pypi_scout/scripts/setup_pinecone.py | 2 +- .../scripts/upload_processed_dataset.py | 58 +++++++++++++++++ pypi_scout/utils/blob_io.py | 15 ++++- start.sh | 4 -- 16 files changed, 174 insertions(+), 144 deletions(-) create mode 100644 .DS_Store rename pypi_scout/data/{reader.py => raw_data_reader.py} (97%) delete mode 100644 pypi_scout/scripts/download_dataset.py create mode 100644 pypi_scout/scripts/download_raw_dataset.py rename pypi_scout/scripts/{process_dataset.py => process_raw_dataset.py} (53%) create mode 100644 pypi_scout/scripts/upload_processed_dataset.py delete mode 100755 start.sh diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..692e3682b622e099b02943a17317dec49f01965a GIT binary patch literal 6148 zcmeHK%}N6?5Kii9g+1#F`2?Ok z`v5+PGfAo~RgY3+1}0xJKS}o6CD~z&@oG0LGge@X2~fmb9-40i{iqX?vz9qPj&lUP zPAiC20ORM0XfjMA1NiP77BZiu%pZM!K`V|rX{q$ab2HBD+`O0<3*ySVQ#~*B({88g zw=ZyXqEs9V`+jg1HG1{J@}WvnKS~;Hoe+f$2)R6ul2G-ks+)uzo$DEg$cucvu-fle zN|mDADQ^#ovcI=gD$2_C?qHA?E9;y4$MyTTnW#rYB8Try%c8|0JVEF4dE(o$ccYpu zbe^BCuZ{Yes#de#s>t4*jn}1@gK>ZOtG{(OGIx3nsbA|pcK@1QBQ^NqB4-yp7wU`;yM+Y>z1ORlvtOfd5OJI()=vvGSf(L|~ zR6vu;ZHd86I@qO+b1h~DO*-SY_~3SBZYvb7SBLqf4rkmoNG&ly42&}{V}=F1|BrsI z|HqSPL<|rE|B3S8conaLY5}{%4xno> SGYA$C`Vr7HP(uv-DFYw=vt|1L literal 0 HcmV?d00001 diff --git a/Dockerfile b/Dockerfile index 66c2eaa..7c271b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,14 +22,10 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev && \ # Copy Python code to the Docker image COPY pypi_scout /code/pypi_scout/ -# Copy the start script and make executable -COPY start.sh /start.sh -RUN chmod +x /start.sh - # Make empty data directory RUN mkdir -p /code/data ENV PYTHONPATH=/code # Use the script as the entrypoint -ENTRYPOINT ["/start.sh"] +CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/DockerfileCPU b/DockerfileCPU index 7f391b1..775c2a5 100644 --- a/DockerfileCPU +++ b/DockerfileCPU @@ -23,14 +23,10 @@ RUN pip install --no-cache-dir -r requirements-cpu.txt # Copy the rest of the application code COPY pypi_scout /code/pypi_scout/ -# Copy the start script and make it executable -COPY start.sh /start.sh -RUN chmod +x /start.sh - # Make empty data directory RUN mkdir -p /code/data ENV PYTHONPATH=/code # Use the script as the entrypoint -ENTRYPOINT ["/start.sh"] +CMD ["uvicorn", "pypi_scout.api.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker-compose.yml b/docker-compose.yml index e5ed2fb..83c37d9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,11 +5,10 @@ services: build: context: . 
dockerfile: Dockerfile - command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 ports: - "8000:8000" volumes: - - ./data:/data + - ./data:/code/data env_file: - .env @@ -18,7 +17,7 @@ services: context: ./frontend dockerfile: Dockerfile args: - NEXT_PUBLIC_API_URL: http://localhost:8000 + NEXT_PUBLIC_API_URL: http://localhost:8000/api ports: - "3000:3000" depends_on: diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py index 4f2bb04..82e22a7 100644 --- a/pypi_scout/api/main.py +++ b/pypi_scout/api/main.py @@ -28,7 +28,7 @@ allow_headers=["*"], ) -df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) +df = load_dataset(config) model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME) diff --git a/pypi_scout/api/utils.py b/pypi_scout/api/utils.py index cab3e58..64162d3 100644 --- a/pypi_scout/api/utils.py +++ b/pypi_scout/api/utils.py @@ -1,12 +1,37 @@ import logging -from pathlib import Path +import sys import polars as pl +from pypi_scout.config import Config, StorageBackend +from pypi_scout.utils.blob_io import BlobIO + + +def load_dataset(config: Config) -> pl.DataFrame: + dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME + + if dataset_path.exists(): + logging.info(f"Found local dataset. Reading dataset from `{dataset_path}`...") + df = pl.read_csv(dataset_path) + + elif config.STORAGE_BACKEND == StorageBackend.BLOB: + logging.info( + f"Downloading `{config.PROCESSED_DATASET_CSV_NAME}` from container `{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}`..." + ) + blob_io = BlobIO( + config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, + config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, + config.STORAGE_BACKEND_BLOB_KEY, + ) + df = blob_io.download_csv(config.PROCESSED_DATASET_CSV_NAME) + logging.info("Finished downloading.") + + else: + logging.error( + f"Dataset {dataset_path} not found, and config.StorageBackend is not `BLOB` so can't download the dataset from Azure. Terminating." + ) + sys.exit(1) -def load_dataset(path_to_dataset: Path): - logging.info("Loading the processed dataset...") - df = pl.read_csv(path_to_dataset) logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}") logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}") logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}") diff --git a/pypi_scout/config.py b/pypi_scout/config.py index ed23fca..ee19d89 100644 --- a/pypi_scout/config.py +++ b/pypi_scout/config.py @@ -27,6 +27,9 @@ class Config: # Dimension of the vector embeddings produced by the model. Should match the output of the model above. EMBEDDINGS_DIMENSION = 768 + # Boolean to overwrite existing files. e.g. re-download the raw dataset, upload processed dataset to blob, etc. + OVERWRITE: bool = True + # Directory where dataset files are stored. DATA_DIR: Path = Path("data") @@ -53,7 +56,10 @@ class Config: WEIGHT_SIMILARITY = 0.8 WEIGHT_WEEKLY_DOWNLOADS = 0.2 - # Storage backend + # Storage backend configuration. Can be either StorageBackend.LOCAL or StorageBackend.BLOB. + # If StorageBackend.BLOB, the processed dataset will be uploaded to Blob, and the backend API + # will read the data from there, rather than from a local data directory. In order to use StorageBackend.BLOB, + # the other `STORAGE_BACKEND_BLOB_` variables need to be set as environment variables. 
STORAGE_BACKEND: StorageBackend = StorageBackend.LOCAL STORAGE_BACKEND_BLOB_ACCOUNT_NAME: str | None = None STORAGE_BACKEND_BLOB_CONTAINER_NAME: str | None = None @@ -76,4 +82,4 @@ def __post_init__(self) -> None: self.STORAGE_BACKEND_BLOB_KEY, ] ): - raise OSError("One or more BLOB storage environment variables are missing!") + raise OSError("One or more BLOB storage environment variables are missing!") # noqa: TRY003 diff --git a/pypi_scout/data/reader.py b/pypi_scout/data/raw_data_reader.py similarity index 97% rename from pypi_scout/data/reader.py rename to pypi_scout/data/raw_data_reader.py index 4a31e9f..f0de2ba 100644 --- a/pypi_scout/data/reader.py +++ b/pypi_scout/data/raw_data_reader.py @@ -5,7 +5,7 @@ @dataclass -class DataReader: +class RawDataReader: """ A class for reading and processing data from a raw PyPI dataset. """ diff --git a/pypi_scout/scripts/download_dataset.py b/pypi_scout/scripts/download_dataset.py deleted file mode 100644 index 3f5c667..0000000 --- a/pypi_scout/scripts/download_dataset.py +++ /dev/null @@ -1,61 +0,0 @@ -import logging - -import gdown -from dotenv import load_dotenv - -from pypi_scout.config import Config, StorageBackend -from pypi_scout.utils.blob_io import BlobIO -from pypi_scout.utils.logging import setup_logging - - -def download_dataset(): - """ - Downloads the dataset from a Google Drive link using the gdown library. - """ - load_dotenv() - config = Config() - - if config.STORAGE_BACKEND == StorageBackend.LOCAL: - handle_for_local_backend(config) - else: - handle_for_blob_backend(config) - - -def handle_for_local_backend(config: Config): - target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME - if target_path.exists(): - logging.info(f"✔️ Raw dataset {target_path} from Google Drive already exists! Skipping download.") - return - - logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...") - url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}" - gdown.download(url, str(target_path), quiet=False) - logging.info("✅ Done!") - - -def handle_for_blob_backend(config: Config): - blob_io = BlobIO( - config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, - config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, - config.STORAGE_BACKEND_BLOB_KEY, - ) - - if blob_io.exists(config.RAW_DATASET_CSV_NAME): - logging.info( - f"✔️ Raw dataset {config.RAW_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download." - ) - return - - temp_target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME - logging.info("⬇️ Downloading raw dataset from Google Drive to temporary file...") - url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}" - gdown.download(url, str(temp_target_path), quiet=False) - - logging.info("Downloading done, now uploading to Blob...") - blob_io.upload_local_csv(temp_target_path, config.RAW_DATASET_CSV_NAME) - logging.info("✅ Done!") - - -if __name__ == "__main__": - setup_logging() - download_dataset() diff --git a/pypi_scout/scripts/download_raw_dataset.py b/pypi_scout/scripts/download_raw_dataset.py new file mode 100644 index 0000000..e10a81f --- /dev/null +++ b/pypi_scout/scripts/download_raw_dataset.py @@ -0,0 +1,35 @@ +import logging + +import gdown +from dotenv import load_dotenv + +from pypi_scout.config import Config +from pypi_scout.utils.logging import setup_logging + + +def download_raw_dataset(): + """ + Downloads the dataset from a Google Drive link using the gdown library. 
+ """ + load_dotenv() + config = Config() + + target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME + if target_path.exists(): + if not config.OVERWRITE: + logging.info(f"🔹 Raw dataset {target_path} from Google Drive already exists! Skipping download.") + return + else: + logging.info( + f"⤵️ Raw dataset {target_path} from Google Drive exists, but config.OVERWRITE is `true`. Overwriting..." + ) + + logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...") + url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}" + gdown.download(url, str(target_path), quiet=False) + logging.info("✅ Done!") + + +if __name__ == "__main__": + setup_logging() + download_raw_dataset() diff --git a/pypi_scout/scripts/process_dataset.py b/pypi_scout/scripts/process_raw_dataset.py similarity index 53% rename from pypi_scout/scripts/process_dataset.py rename to pypi_scout/scripts/process_raw_dataset.py index dedc8c0..61741b8 100644 --- a/pypi_scout/scripts/process_dataset.py +++ b/pypi_scout/scripts/process_raw_dataset.py @@ -1,19 +1,17 @@ import logging -from pathlib import Path import polars as pl from dotenv import load_dotenv -from pypi_scout.config import Config, StorageBackend +from pypi_scout.config import Config from pypi_scout.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner -from pypi_scout.data.reader import DataReader -from pypi_scout.utils.blob_io import BlobIO +from pypi_scout.data.raw_data_reader import RawDataReader from pypi_scout.utils.logging import setup_logging def read_raw_dataset(path_to_raw_dataset): logging.info("📂 Reading the raw dataset...") - df = DataReader(path_to_raw_dataset).read() + df = RawDataReader(path_to_raw_dataset).read() logging.info("📊 Number of rows in the raw dataset: %s", len(df)) logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}") logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}") @@ -44,61 +42,22 @@ def clean_descriptions(df): return df -def store_processed_dataset_local(df: pl.DataFrame, processed_dataset_path: Path): +def store_processed_dataset(df, processed_dataset_path): logging.info("Storing the processed dataset...") df.write_csv(processed_dataset_path) logging.info("✅ Done!") -def store_processed_dataset_blob(df: pl.DataFrame, blob_io: BlobIO, blob_name: str): - logging.info(f"Storing the processed dataset as {blob_name} in container '{blob_io.container_name}'...") - blob_io.upload_csv(df, blob_name) - logging.info("✅ Done!") - - -def handle_for_local_backend(config: Config): - if (config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME).exists(): - logging.info(f"✔️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists! Skipping.") - return - - df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME) - if config.FRAC_DATA_TO_INCLUDE < 1.0: - df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE) - df = clean_descriptions(df) - - store_processed_dataset_local(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) - - -def handle_for_blob_backend(config: Config): - blob_io = BlobIO( - config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, - config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, - config.STORAGE_BACKEND_BLOB_KEY, - ) - - if blob_io.exists(config.PROCESSED_DATASET_CSV_NAME): - logging.info( - f"✔️ Raw dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping download." 
- ) - return - +def process_raw_dataset(): + load_dotenv() + config = Config() df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME) if config.FRAC_DATA_TO_INCLUDE < 1.0: df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE) df = clean_descriptions(df) - - store_processed_dataset_blob(df, blob_io, config.PROCESSED_DATASET_CSV_NAME) - - -def process_dataset(): - load_dotenv() - config = Config() - if config.STORAGE_BACKEND == StorageBackend.LOCAL: - handle_for_local_backend(config) - else: - handle_for_blob_backend(config) + store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) if __name__ == "__main__": setup_logging() - process_dataset() + process_raw_dataset() diff --git a/pypi_scout/scripts/setup.py b/pypi_scout/scripts/setup.py index 680b5ed..812e6a5 100644 --- a/pypi_scout/scripts/setup.py +++ b/pypi_scout/scripts/setup.py @@ -1,18 +1,30 @@ import argparse +import logging -from pypi_scout.scripts.download_dataset import download_dataset -from pypi_scout.scripts.process_dataset import process_dataset +from pypi_scout.scripts.download_raw_dataset import download_raw_dataset +from pypi_scout.scripts.process_raw_dataset import process_raw_dataset from pypi_scout.scripts.setup_pinecone import setup_pinecone +from pypi_scout.scripts.upload_processed_dataset import upload_processed_dataset from pypi_scout.scripts.upsert_data import upsert_data from pypi_scout.utils.logging import setup_logging def main(no_upsert): setup_logging() + + logging.info("\n\nSETTING UP PINECONE -------------\n") setup_pinecone() - download_dataset() - process_dataset() + + logging.info("\n\nDOWNLOADING RAW DATASET -------------\n") + download_raw_dataset() + + logging.info("\n\nPROCESSING RAW DATASET -------------\n") + process_raw_dataset() + + logging.info("\n\nUPLOADING PROCESSED DATASET -------------\n") + upload_processed_dataset() if not no_upsert: + logging.info("\n\nUPSERTING DATA TO PINECONE -------------\n") upsert_data() diff --git a/pypi_scout/scripts/setup_pinecone.py b/pypi_scout/scripts/setup_pinecone.py index c3d12f6..4f3b1f2 100644 --- a/pypi_scout/scripts/setup_pinecone.py +++ b/pypi_scout/scripts/setup_pinecone.py @@ -33,7 +33,7 @@ def setup_pinecone(): logging.info("✅ Pinecone index created successfully.") except PineconeApiException as e: if e.status == 409: - logging.warning(f"✔️ Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.") + logging.warning(f"🔹 Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.") else: logging.exception("❌ An error occurred while creating the Pinecone index.") diff --git a/pypi_scout/scripts/upload_processed_dataset.py b/pypi_scout/scripts/upload_processed_dataset.py new file mode 100644 index 0000000..5bd4df9 --- /dev/null +++ b/pypi_scout/scripts/upload_processed_dataset.py @@ -0,0 +1,58 @@ +import logging +from pathlib import Path + +import polars as pl +from dotenv import load_dotenv + +from pypi_scout.config import Config, StorageBackend +from pypi_scout.utils.blob_io import BlobIO +from pypi_scout.utils.logging import setup_logging + + +def read_processed_dataset(path_to_processed_dataset: Path): + logging.info("📂 Reading the processed dataset...") + df = pl.read_csv(path_to_processed_dataset) + logging.info("📊 Number of rows in the processed dataset: %s", len(df)) + return df + + +def read_csv_from_local_and_upload(config: Config): + blob_io = BlobIO( + config.STORAGE_BACKEND_BLOB_ACCOUNT_NAME, + config.STORAGE_BACKEND_BLOB_CONTAINER_NAME, + config.STORAGE_BACKEND_BLOB_KEY, + ) + + if 
blob_io.exists(config.PROCESSED_DATASET_CSV_NAME): + if not config.OVERWRITE: + logging.info( + f"🔹 Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}'! Skipping upload." + ) + return + else: + logging.info( + f"⤵️ Processed dataset {config.PROCESSED_DATASET_CSV_NAME} already exists in container '{config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}', but config.OVERWRITE is `true`. Overwriting..." + ) + + df = read_processed_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) + logging.info( + f"Uploading {config.PROCESSED_DATASET_CSV_NAME} to blob container {config.STORAGE_BACKEND_BLOB_CONTAINER_NAME}..." + ) + blob_io.upload_csv(df, config.PROCESSED_DATASET_CSV_NAME) + logging.info("✅ Done!") + + +def upload_processed_dataset(): + load_dotenv() + config = Config() + if config.STORAGE_BACKEND != StorageBackend.BLOB: + logging.info( + "Not using BLOB backend. Skipping upload. To enable, configure the `STORAGE_BACKEND_` variables in config" + ) + return + read_csv_from_local_and_upload(config) + + +if __name__ == "__main__": + setup_logging() + upload_processed_dataset() diff --git a/pypi_scout/utils/blob_io.py b/pypi_scout/utils/blob_io.py index 28d0078..74eaa5e 100644 --- a/pypi_scout/utils/blob_io.py +++ b/pypi_scout/utils/blob_io.py @@ -1,3 +1,4 @@ +import tempfile from io import BytesIO import polars as pl @@ -29,9 +30,17 @@ def upload_local_csv(self, local_file_path: str, blob_name: str) -> None: def download_csv(self, blob_name: str) -> pl.DataFrame: blob_client = self.container_client.get_blob_client(blob_name) download_stream = blob_client.download_blob() - csv_content = download_stream.content_as_text() - csv_buffer = StringIO(csv_content) - return pl.read_csv(csv_buffer) + + # Create a temporary file + with tempfile.NamedTemporaryFile(delete=True) as temp_file: + # Download the blob content into the temporary file + temp_file.write(download_stream.readall()) + temp_file.flush() + + # Read the CSV using Polars + df = pl.read_csv(temp_file.name) + + return df def exists(self, blob_name): blob_client = self.container_client.get_blob_client(blob_name) diff --git a/start.sh b/start.sh deleted file mode 100755 index a8cfd13..0000000 --- a/start.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -set -e -python pypi_scout/scripts/setup.py --no-upsert -uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 From 03ffaf7ad856463d15a66388a6766ab57e36e524 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Fri, 21 Jun 2024 15:15:18 +0200 Subject: [PATCH 13/22] fix reqs for cpu --- requirements-cpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 7df01fe..ea1ce89 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -13,5 +13,6 @@ uvicorn==0.30.1 gdown==5.2.0 torch==2.0.1 numpy==1.24.4 +azure-storage-blob==12.20.0 --index-url=https://download.pytorch.org/whl/cpu --extra-index-url=https://pypi.org/simple From d7994f74f453f32f9a4d5da5af822e8978b6cac9 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Fri, 21 Jun 2024 15:35:32 +0200 Subject: [PATCH 14/22] fix max width --- frontend/app/globals.css | 11 +++-------- frontend/app/page.tsx | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/frontend/app/globals.css b/frontend/app/globals.css index 3216bc2..7c5c7b4 100644 --- a/frontend/app/globals.css +++ b/frontend/app/globals.css @@ -6,8 +6,8 @@ --foreground-rgb: 0, 0, 0; --background-start-rgb: 214, 219, 220; --background-end-rgb: 255, 255, 255; 
- --dark-bg-start-rgb: 10, 10, 35; /* Very dark blue almost grey */ - --dark-bg-end-rgb: 25, 25, 50; /* Dark blue */ + --dark-bg-start-rgb: 17, 24, 39; /* Dark gray (bg-gray-900) */ + --dark-bg-end-rgb: 17, 24, 39; /* Dark gray (bg-gray-900) */ --dark-foreground-rgb: 255, 255, 255; } @@ -21,12 +21,7 @@ body { color: rgb(var(--foreground-rgb)); - background: linear-gradient( - to bottom, - transparent, - rgb(var(--background-end-rgb)) - ) - rgb(var(--background-start-rgb)); + background: rgb(var(--background-start-rgb)); /* Solid background color */ } @layer utilities { diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx index c3ce2cc..f9b603f 100644 --- a/frontend/app/page.tsx +++ b/frontend/app/page.tsx @@ -31,7 +31,7 @@ export default function Home() { }; return ( -
+
Date: Fri, 21 Jun 2024 16:19:25 +0200 Subject: [PATCH 15/22] add URL --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 991882e..737c04c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -54,4 +54,4 @@ jobs: push: true tags: pypiscoutacr.azurecr.io/pypi-scout-frontend:latest build-args: | - NEXT_PUBLIC_API_URL=${{ env.HOST }}/api + NEXT_PUBLIC_API_URL=https://pypiscout.com/api From 9dc03ce5b5ad7ce259250d965cc985760802f060 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Sat, 22 Jun 2024 12:25:40 +0200 Subject: [PATCH 16/22] add github and kofi badges, add rate limiter --- frontend/app/components/GitHubButton.tsx | 25 +++ .../app/components/SearchResultsTable.tsx | 10 +- frontend/app/components/SupportButton.tsx | 23 +++ frontend/app/page.tsx | 21 ++- frontend/app/utils/search.ts | 11 +- frontend/public/kofi.png | Bin 0 -> 35716 bytes poetry.lock | 144 +++++++++++++++++- pypi_scout/api/main.py | 32 +++- pypi_scout/config.py | 4 +- pypi_scout/vector_database/interface.py | 1 + pyproject.toml | 1 + 11 files changed, 251 insertions(+), 21 deletions(-) create mode 100644 frontend/app/components/GitHubButton.tsx create mode 100644 frontend/app/components/SupportButton.tsx create mode 100644 frontend/public/kofi.png diff --git a/frontend/app/components/GitHubButton.tsx b/frontend/app/components/GitHubButton.tsx new file mode 100644 index 0000000..8104cc5 --- /dev/null +++ b/frontend/app/components/GitHubButton.tsx @@ -0,0 +1,25 @@ +import React from "react"; + +const GitHubButton: React.FC = () => { + return ( + + + + + GitHub + + ); +}; + +export default GitHubButton; diff --git a/frontend/app/components/SearchResultsTable.tsx b/frontend/app/components/SearchResultsTable.tsx index 4e866c3..befefa5 100644 --- a/frontend/app/components/SearchResultsTable.tsx +++ b/frontend/app/components/SearchResultsTable.tsx @@ -25,6 +25,12 @@ const SearchResultsTable: React.FC = ({ return sortField === field ? (sortDirection === "asc" ? "▲" : "▼") : ""; }; + const truncateText = (text: string, maxLength: number) => { + return text.length > maxLength + ? `${text.substring(0, maxLength)}...` + : text; + }; + return (
@@ -70,7 +76,7 @@ const SearchResultsTable: React.FC = ({ {results.map((result, index) => ( -
- {result.name} + {truncateText(result.name, 20)} {result.similarity.toFixed(3)} @@ -78,7 +84,7 @@ const SearchResultsTable: React.FC = ({ {result.weekly_downloads.toLocaleString()} + {result.summary} diff --git a/frontend/app/components/SupportButton.tsx b/frontend/app/components/SupportButton.tsx new file mode 100644 index 0000000..a55a2fb --- /dev/null +++ b/frontend/app/components/SupportButton.tsx @@ -0,0 +1,23 @@ +import React from "react"; + +const SupportButton: React.FC = () => { + return ( + + Ko-fi logo + Support + + ); +}; + +export default SupportButton; diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx index f9b603f..b539365 100644 --- a/frontend/app/page.tsx +++ b/frontend/app/page.tsx @@ -5,6 +5,8 @@ import { handleSearch, sortResults } from "./utils/search"; import SearchResultsTable from "./components/SearchResultsTable"; import InfoBox from "./components/InfoBox"; import { ClipLoader } from "react-spinners"; +import GitHubButton from "./components/GitHubButton"; +import SupportButton from "./components/SupportButton"; interface Match { name: string; @@ -31,20 +33,29 @@ export default function Home() { }; return ( -
-
+
+
+
+ + +
+
+ +
pypi-scout logo

Enter your query to search for Python packages

-
+ +
);