Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
arnidan committed Dec 11, 2024
1 parent 4955814 commit 18b0015
Show file tree
Hide file tree
Showing 11 changed files with 514 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Python bytecode and interpreter artifacts
__pycache__
*.pyc
*.pyo
*.pyd
.Python
# Local virtual environments and environment files
env/
venv/
.env
# Logs
*.log
# Version-control metadata
.git
.gitignore
# Test and coverage output
.pytest_cache/
.coverage
htmlcov/
# macOS Finder metadata
.DS_Store
# Local copy of the model directory; the image build downloads its own copy
chrome5/
59 changes: 59 additions & 0 deletions .github/workflows/build-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
name: build docker & deploy

# Runs on pushes to main, on every pull request, and on manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      # packages: write lets GITHUB_TOKEN push the image to ghcr.io.
      packages: write
      contents: read
    outputs:
      IMAGE_ID: ${{ steps.prepare.outputs.IMAGE_ID }}
    steps:
      - uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Derives the image name (IMAGE_ID) and tag (VERSION) from the repository and ref.
      - name: Prepare
        id: prepare
        # Pass GitHub context through the environment instead of interpolating it
        # into the script text: a ref name is attacker-controllable and would
        # otherwise be parsed as shell code.
        env:
          REPOSITORY: ${{ github.repository }}
          REF_NAME: ${{ github.ref_name }}
          REF_TYPE: ${{ github.ref_type }}
        run: |
          IMAGE_ID="ghcr.io/${REPOSITORY}"
          # Change all uppercase to lowercase
          IMAGE_ID=$(echo "$IMAGE_ID" | tr '[:upper:]' '[:lower:]')
          VERSION="$REF_NAME"
          # Replace every slash in a branch name (slashes are not valid in image tags)
          if [[ "$REF_TYPE" == "branch" ]]; then
            VERSION="${VERSION//\//_}"
          fi
          # Strip "v" prefix from tag name
          if [[ "$REF_TYPE" == "tag" ]]; then
            VERSION="${VERSION#v}"
          fi
          # Use Docker `latest` tag convention
          if [[ "$VERSION" == "main" ]]; then
            VERSION=latest
          fi
          echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT"
          echo "IMAGE_ID=$IMAGE_ID" >> "$GITHUB_OUTPUT"

      # NOTE(review): push is unconditional, so pull_request runs push too.
      # Presumably this fails for PRs from forks (read-only token) — confirm
      # whether PR builds are meant to publish images.
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: |
            ${{ steps.prepare.outputs.IMAGE_ID }}:${{ steps.prepare.outputs.VERSION }}
            ${{ steps.prepare.outputs.IMAGE_ID }}:${{ github.run_id }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
54 changes: 54 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/

# IDE
.idea/
.vscode/
*.swp
*.swo
.DS_Store

# Project specific
# Model directory and the artifact types that download_model.sh fetches into it
chrome5/
*.tflite
*.pb
*.pb.gz
*.tsv
# NOTE(review): this ignores every JSON file in the repository, not only
# chrome5/config.json — confirm that is intended.
*.json

# Logs
*.log
logs/
log/

# Test coverage
.coverage
htmlcov/
.pytest_cache/
.tox/
coverage.xml
*.cover
38 changes: 38 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Stage "downloader": fetches the model files with download_model.sh.
FROM python:3.8-slim AS downloader

Check warning on line 1 in Dockerfile

View workflow job for this annotation

GitHub Actions / build

The 'as' keyword should match the case of the 'from' keyword

FromAsCasing: 'as' and 'FROM' keywords' casing do not match More info: https://docs.docker.com/go/dockerfile/rule/from-as-casing/

# curl is what download_model.sh uses to fetch the model files over HTTPS;
# ca-certificates is listed explicitly because recommended packages are skipped.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /model

# The script writes its downloads to ./chrome5, i.e. /model/chrome5.
COPY download_model.sh .
RUN chmod +x download_model.sh && ./download_model.sh

# Stage "builder": installs the Python dependencies under /install.
FROM python:3.8-slim AS builder

Check warning on line 10 in Dockerfile

View workflow job for this annotation

GitHub Actions / build

The 'as' keyword should match the case of the 'from' keyword

FromAsCasing: 'as' and 'FROM' keywords' casing do not match More info: https://docs.docker.com/go/dockerfile/rule/from-as-casing/

WORKDIR /install

# Compiler toolchain and headers for dependencies that build from source.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        python3-dev \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# Install under the /install prefix so a later stage can copy the whole tree.
# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir --prefix=/install -r requirements.txt && \
    pip3 install --no-cache-dir --prefix=/install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime

# Final runtime image: model files + installed packages + application source.
FROM python:3.8-slim

WORKDIR /app

# Runtime shared library only; the apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libusb-1.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Model files fetched by the downloader stage.
COPY --from=downloader /model/chrome5 ./chrome5
# Python packages installed with --prefix=/install in the builder stage.
COPY --from=builder /install /usr/local

COPY src/ ./src/

# /app on PYTHONPATH lets "src.app" be imported as a package.
ENV PYTHONPATH=/app
EXPOSE 8000

# NOTE(review): the container runs as root because no USER is set. Confirm
# whether the classifier needs root (e.g. for USB access) before adding one.
CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "8000"]
95 changes: 95 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# URL Topic Classifier API

A service that classifies domains into topics using Google Chrome's Topics API model.

## Overview

This service provides a REST API endpoint that accepts domains and returns their classified topics using the Chrome Topics API classifier model. It uses the same model and classification logic as Chrome's Topics API.

This project is based on the [Chrome Topics Classifier](https://github.com/yohhaan/topics_classifier) repository, which provides Python examples for using Chrome's Topics API model.

The Topics API is part of Chrome's Privacy Sandbox initiative, designed to help serve relevant ads without third-party cookies. You can learn more about it in the [Chrome Topics API documentation](https://developer.chrome.com/docs/privacy-sandbox/topics/).

## Quick Start with Docker

1. Build and run with Docker:
```bash
docker build -t url-classifier .
docker run -p 8000:8000 url-classifier
```

The API will be available at `http://localhost:8000`

## Manual Setup

_Note: It seems to work only with Python 3.8._

1. Create a virtual environment and activate it:
```bash
python -m venv venv
source venv/bin/activate
```

2. Install dependencies:
```bash
pip install -r requirements.txt
pip3 install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime
```

3. Download the model files:
```bash
chmod +x scripts/download_model.sh
./scripts/download_model.sh
```

4. Run the application:
```bash
uvicorn src.app:app --reload
```

## API Usage

### Classify Domain

**Endpoint:** `POST /classify`

**Request:**
```json
{
"domain": "example.com"
}
```

**Response:**
```json
{
"domain": "example.com",
"topics": [
{
"id": 123,
"name": "Topic Name"
}
]
}
```

**cURL Example:**
```bash
curl -X POST "http://localhost:8000/classify" \
-H "Content-Type: application/json" \
-d '{"domain": "example.com"}'
```

### API Documentation

Once the server is running, you can access:
- Swagger UI documentation: `http://localhost:8000/docs`
- ReDoc documentation: `http://localhost:8000/redoc`

## License

MIT License

## Credits

This project uses the Topics API model from Google Chrome's Topics API implementation. The model and classification logic are based on the [Chrome Topics Classifier](https://github.com/yohhaan/topics_classifier) repository.
46 changes: 46 additions & 0 deletions download_model.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/sh
# Downloads the chrome5/ model files from the topics_classifier repository
# into the current working directory, then checks the essential ones exist.
# Exits 1 on the first failed download or missing essential file.

mkdir -p chrome5

REPO_URL="https://raw.githubusercontent.com/yohhaan/topics_classifier/main"

# download_file PATH
# Fetches $REPO_URL/PATH and saves it at the same relative PATH.
# Returns 0 on success, 1 if curl fails (curl's output is printed).
download_file() {
    echo "Downloading $1..."

    # --fail makes curl exit non-zero on HTTP errors (e.g. 404). Without it
    # the server's error page is saved as the file and reported as a success.
    if output=$(curl --fail -L "$REPO_URL/$1" --create-dirs -o "$1" 2>&1); then
        echo "✓ Successfully downloaded $1"
        return 0
    fi

    echo "✗ Error downloading $1"
    echo "Curl output:"
    echo "$output"
    return 1
}

# Fetch every model file; stop at the first failure.
for file in \
    "chrome5/config.json" \
    "chrome5/model-info.pb" \
    "chrome5/model.tflite" \
    "chrome5/override_list.pb.gz" \
    "chrome5/override_list.tsv" \
    "chrome5/taxonomy.tsv" \
    "chrome5/utility_buckets.tsv"
do
    download_file "$file" || exit 1
done

# Confirm the files the script treats as essential are present on disk.
for file in \
    "chrome5/config.json" \
    "chrome5/model.tflite" \
    "chrome5/taxonomy.tsv" \
    "chrome5/override_list.tsv"
do
    if [ ! -f "$file" ]; then
        echo "Error: Essential file $file is missing!"
        exit 1
    fi
done

echo "✓ All files downloaded successfully!"
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Web API: src/app.py imports fastapi and pydantic; uvicorn is the server the image starts.
fastapi>=0.68.0
uvicorn>=0.15.0
pydantic>=1.8.0
# NOTE(review): black is a code formatter — presumably dev-only; confirm it is needed at runtime.
black
# Presumably used by the classifier module (not visible here) — TODO confirm.
numpy
pandas
pyarrow
tflite-support>=0.4.3
# NOTE(review): no visible code handles form or file uploads — confirm this is still required.
python-multipart>=0.0.5
46 changes: 46 additions & 0 deletions scripts/download_model.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/sh
# Downloads the chrome5/ model files from the topics_classifier repository
# into the current working directory, then checks the essential ones exist.
# Exits 1 on the first failed download or missing essential file.

mkdir -p chrome5

REPO_URL="https://raw.githubusercontent.com/yohhaan/topics_classifier/main"

# download_file PATH
# Fetches $REPO_URL/PATH and saves it at the same relative PATH.
# Returns 0 on success, 1 if curl fails (curl's output is printed).
download_file() {
    echo "Downloading $1..."

    # --fail makes curl exit non-zero on HTTP errors (e.g. 404). Without it
    # the server's error page is saved as the file and reported as a success.
    if output=$(curl --fail -L "$REPO_URL/$1" --create-dirs -o "$1" 2>&1); then
        echo "✓ Successfully downloaded $1"
        return 0
    fi

    echo "✗ Error downloading $1"
    echo "Curl output:"
    echo "$output"
    return 1
}

# Fetch every model file; stop at the first failure.
for file in \
    "chrome5/config.json" \
    "chrome5/model-info.pb" \
    "chrome5/model.tflite" \
    "chrome5/override_list.pb.gz" \
    "chrome5/override_list.tsv" \
    "chrome5/taxonomy.tsv" \
    "chrome5/utility_buckets.tsv"
do
    download_file "$file" || exit 1
done

# Confirm the files the script treats as essential are present on disk.
for file in \
    "chrome5/config.json" \
    "chrome5/model.tflite" \
    "chrome5/taxonomy.tsv" \
    "chrome5/override_list.tsv"
do
    if [ ! -f "$file" ]; then
        echo "Error: Essential file $file is missing!"
        exit 1
    fi
done

echo "✓ All files downloaded successfully!"
1 change: 1 addition & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Empty file to make the directory a Python package
18 changes: 18 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from src.classifier import TopicsClassifier

# ASGI application object; the route below is registered on it.
app = FastAPI(title="Topics Classification API")


# Request body for POST /classify: a single string field, `domain`.
class DomainRequest(BaseModel):
    domain: str


# One classifier instance, created at import time and used by every request.
classifier = TopicsClassifier()


# POST /classify: classify the submitted domain and echo it back with its topics.
# Any exception raised while classifying is returned to the client as an
# HTTP 500 whose detail is the exception's message.
@app.post("/classify")
async def classify_domain(request: DomainRequest):
    try:
        return {
            "domain": request.domain,
            "topics": classifier.classify_domain(request.domain),
        }
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
Loading

0 comments on commit 18b0015

Please sign in to comment.