init

arnidan · Dec 11, 2024 · 18b0015 · 18b0015
1 parent 4955814
commit 18b0015
Show file tree

Hide file tree

Showing 11 changed files with 514 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,16 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+.env
+*.log
+.git
+.gitignore
+.pytest_cache/
+.coverage
+htmlcov/
+.DS_Store 
+chrome5/
diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml
@@ -0,0 +1,59 @@
+name: build docker & deploy
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+    outputs:
+      IMAGE_ID: ${{ steps.prepare.outputs.IMAGE_ID }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Prepare
+        id: prepare
+        # GitHub context values are passed through env rather than being
+        # interpolated into the script text, so a crafted ref name cannot
+        # inject shell commands into this step.
+        env:
+          REPOSITORY: ${{ github.repository }}
+          REF_NAME: ${{ github.ref_name }}
+          REF_TYPE: ${{ github.ref_type }}
+        run: |
+          # Image name is the repository path, lowercased
+          IMAGE_ID=$(echo "ghcr.io/$REPOSITORY" | tr '[:upper:]' '[:lower:]')
+
+          VERSION="$REF_NAME"
+          # Replace every slash in a branch name with an underscore
+          [[ "$REF_TYPE" == "branch" ]] && VERSION=$(echo "$VERSION" | tr '/' '_')
+          # Strip "v" prefix from tag name
+          [[ "$REF_TYPE" == "tag" ]] && VERSION=$(echo "$VERSION" | sed -e 's/^v//')
+          # Use Docker `latest` tag convention
+          [ "$VERSION" == "main" ] && VERSION=latest
+
+          echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT"
+          echo "IMAGE_ID=$IMAGE_ID" >> "$GITHUB_OUTPUT"
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          tags: |
+            ${{ steps.prepare.outputs.IMAGE_ID }}:${{ steps.prepare.outputs.VERSION }}
+            ${{ steps.prepare.outputs.IMAGE_ID }}:${{ github.run_id }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,54 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+env/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+.DS_Store
+
+# Project specific
+chrome5/
+*.tflite
+*.pb
+*.pb.gz
+*.tsv
+*.json
+
+# Logs
+*.log
+logs/
+log/
+
+# Test coverage
+.coverage
+htmlcov/
+.pytest_cache/
+.tox/
+coverage.xml
+*.cover 
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,38 @@
+# Stage 1 (downloader): run download_model.sh to fetch the model files into
+# /model/chrome5. Only that directory is copied into the final image.
+FROM python:3.8-slim as downloader
+
+# curl is the tool download_model.sh uses to fetch each file.
+RUN apt-get update && apt-get install -y curl
+
+WORKDIR /model
+
+COPY download_model.sh .
+RUN chmod +x download_model.sh && ./download_model.sh
+
+# Stage 2 (builder): install the Python dependencies under the /install prefix
+# so they can be copied into the final image without the build toolchain.
+FROM python:3.8-slim as builder
+
+WORKDIR /install
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    python3-dev 
+
+COPY requirements.txt .
+# tflite_runtime is installed in a separate pip call that adds the Coral
+# package repository as an extra index.
+RUN pip3 install --prefix=/install -r requirements.txt && \
+    pip3 install --prefix=/install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime
+
+# Final stage: runtime image that serves the API.
+FROM python:3.8-slim
+
+WORKDIR /app
+
+# Install the one extra system library and drop the apt lists afterwards to
+# keep the layer small.
+RUN apt-get update && apt-get install -y \
+    libusb-1.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bring in the downloaded model files and the packages installed by the
+# builder stage (the /install prefix is overlaid onto /usr/local).
+COPY --from=downloader /model/chrome5 ./chrome5
+COPY --from=builder /install /usr/local
+
+COPY src/ ./src/
+
+# /app is put on the import path so the `src.app:app` target below resolves.
+ENV PYTHONPATH=/app
+EXPOSE 8000
+
+CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "8000"] 
diff --git a/README.md b/README.md
@@ -0,0 +1,95 @@
+# URL Topic Classifier API
+
+A service that classifies domains into topics using Google Chrome's Topics API model.
+
+## Overview
+
+This service provides a REST API endpoint that accepts domains and returns their classified topics using the Chrome Topics API classifier model. It uses the same model and classification logic as Chrome's Topics API.
+
+This project is based on the [Chrome Topics Classifier](https://github.com/yohhaan/topics_classifier) repository, which provides Python examples for using Chrome's Topics API model.
+
+The Topics API is part of Chrome's Privacy Sandbox initiative, designed to help serve relevant ads without third-party cookies. You can learn more about it in the [Chrome Topics API documentation](https://developer.chrome.com/docs/privacy-sandbox/topics/).
+
+## Quick Start with Docker
+
+1. Build and run with Docker:
+```bash
+docker build -t url-classifier .
+docker run -p 8000:8000 url-classifier
+```
+
+The API will be available at `http://localhost:8000`
+
+## Manual Setup
+
+_Note: This project appears to work only with Python 3.8, the version the Docker image is built on._
+
+1. Create a virtual environment and activate it:
+```bash
+python -m venv venv
+source venv/bin/activate
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+pip3 install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime
+```
+
+3. Download the model files:
+```bash
+chmod +x scripts/download_model.sh
+./scripts/download_model.sh
+```
+
+4. Run the application:
+```bash
+uvicorn src.app:app --reload
+```
+
+## API Usage
+
+### Classify Domain
+
+**Endpoint:** `POST /classify`
+
+**Request:**
+```json
+{
+    "domain": "example.com"
+}
+```
+
+**Response:**
+```json
+{
+    "domain": "example.com",
+    "topics": [
+        {
+            "id": 123,
+            "name": "Topic Name"
+        }
+    ]
+}
+```
+
+**cURL Example:**
+```bash
+curl -X POST "http://localhost:8000/classify" \
+     -H "Content-Type: application/json" \
+     -d '{"domain": "example.com"}'
+```
+
+### API Documentation
+
+Once the server is running, you can access:
+- Swagger UI documentation: `http://localhost:8000/docs`
+- ReDoc documentation: `http://localhost:8000/redoc`
+
+## License
+
+MIT License
+
+## Credits
+
+This project uses the classifier model from Google Chrome's Topics API implementation. The model and classification logic are based on the [Chrome Topics Classifier](https://github.com/yohhaan/topics_classifier) repository.
diff --git a/download_model.sh b/download_model.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+mkdir -p chrome5
+
+REPO_URL="https://raw.githubusercontent.com/yohhaan/topics_classifier/main"
+
+download_file() {
+    echo "Downloading $1..."
+    output=$(curl -L "$REPO_URL/$1" --create-dirs -o "$1" 2>&1)
+
+    if [ $? -eq 0 ]; then
+        echo "✓ Successfully downloaded $1"
+        return 0
+    else
+        echo "✗ Error downloading $1"
+        echo "Curl output:"
+        echo "$output"
+        return 1
+    fi
+}
+
+for file in \
+    "chrome5/config.json" \
+    "chrome5/model-info.pb" \
+    "chrome5/model.tflite" \
+    "chrome5/override_list.pb.gz" \
+    "chrome5/override_list.tsv" \
+    "chrome5/taxonomy.tsv" \
+    "chrome5/utility_buckets.tsv"
+do
+    download_file "$file" || exit 1
+done
+
+for file in \
+    "chrome5/config.json" \
+    "chrome5/model.tflite" \
+    "chrome5/taxonomy.tsv" \
+    "chrome5/override_list.tsv"
+do
+    if [ ! -f "$file" ]; then
+        echo "Error: Essential file $file is missing!"
+        exit 1
+    fi
+done
+
+echo "✓ All files downloaded successfully!"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,9 @@
+fastapi>=0.68.0
+uvicorn>=0.15.0
+pydantic>=1.8.0
+black
+numpy
+pandas
+pyarrow
+tflite-support>=0.4.3
+python-multipart>=0.0.5 
diff --git a/scripts/download_model.sh b/scripts/download_model.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+mkdir -p chrome5
+
+REPO_URL="https://raw.githubusercontent.com/yohhaan/topics_classifier/main"
+
+download_file() {
+    echo "Downloading $1..."
+    output=$(curl -L "$REPO_URL/$1" --create-dirs -o "$1" 2>&1)
+
+    if [ $? -eq 0 ]; then
+        echo "✓ Successfully downloaded $1"
+        return 0
+    else
+        echo "✗ Error downloading $1"
+        echo "Curl output:"
+        echo "$output"
+        return 1
+    fi
+}
+
+for file in \
+    "chrome5/config.json" \
+    "chrome5/model-info.pb" \
+    "chrome5/model.tflite" \
+    "chrome5/override_list.pb.gz" \
+    "chrome5/override_list.tsv" \
+    "chrome5/taxonomy.tsv" \
+    "chrome5/utility_buckets.tsv"
+do
+    download_file "$file" || exit 1
+done
+
+for file in \
+    "chrome5/config.json" \
+    "chrome5/model.tflite" \
+    "chrome5/taxonomy.tsv" \
+    "chrome5/override_list.tsv"
+do
+    if [ ! -f "$file" ]; then
+        echo "Error: Essential file $file is missing!"
+        exit 1
+    fi
+done
+
+echo "✓ All files downloaded successfully!" 
diff --git a/src/__init__.py b/src/__init__.py
@@ -0,0 +1 @@
+# Empty file to make the directory a Python package 
diff --git a/src/app.py b/src/app.py
@@ -0,0 +1,18 @@
+import logging
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from src.classifier import TopicsClassifier
+
+app = FastAPI(title="Topics Classification API")
+
+# Request body accepted by POST /classify.
+class DomainRequest(BaseModel):
+    # Domain to classify; handed unchanged to the classifier.
+    domain: str
+
+classifier = TopicsClassifier()
+
+@app.post("/classify")
+async def classify_domain(request: DomainRequest):
+    try:
+        topics = classifier.classify_domain(request.domain)
+        return {"domain": request.domain, "topics": topics}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# Empty file to make the directory a Python package