From 3f3673f37e51e940042750b1d0b26adacbee48e9 Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Fri, 6 Dec 2024 21:59:53 +0100 Subject: [PATCH] feat(backend): add opencollective donations data exploration --- .../data_loaders/donations/opencollective.py | 226 ++++++++ backend-python/poetry.lock | 57 +- backend-python/pyproject.toml | 1 + .../2024-11-17-david-opencollective.ipynb | 508 ++++++++++++++++++ 4 files changed, 791 insertions(+), 1 deletion(-) create mode 100644 backend-python/media_impact_monitor/data_loaders/donations/opencollective.py create mode 100644 notebooks/2024-11-17-david-opencollective.ipynb diff --git a/backend-python/media_impact_monitor/data_loaders/donations/opencollective.py b/backend-python/media_impact_monitor/data_loaders/donations/opencollective.py new file mode 100644 index 0000000..7161fd2 --- /dev/null +++ b/backend-python/media_impact_monitor/data_loaders/donations/opencollective.py @@ -0,0 +1,226 @@ +from gql import gql, Client +from gql.transport.requests import RequestsHTTPTransport +import os +from dotenv import load_dotenv +import pandas as pd +import matplotlib.pyplot as plt +from joblib import Memory +from tqdm.auto import tqdm +import time +import re + +# Setup caching +memory = Memory(".cache", verbose=0) +cache = memory.cache +wait_time = 1 + +load_dotenv() +transport = RequestsHTTPTransport( + url="https://api.opencollective.com/graphql/v2", + headers={"Personal-Token": os.getenv("OPENCOLLECTIVE_API_TOKEN")}, +) +client = Client(transport=transport, fetch_schema_from_transport=True) + + +@cache(ignore=["wait_time"]) +def fetch(query, variable_values, wait_time=0.1, **kwargs): + time.sleep(wait_time) + return client.execute(query, variable_values=variable_values, **kwargs) + + +def fetch_climate_orgs(limit=1000): + # Define search terms + search_terms = [ + "climate", + "for future", + "extinction rebellion", + "xr", + "fossil", + "oil", + ] + + query = gql(""" + query 
GetAccounts($limit: Int, $offset: Int, $searchTerm: String) { + accounts( + limit: $limit + offset: $offset + isActive: true + searchTerm: $searchTerm + type: COLLECTIVE + ) { + nodes { + slug + name + legalName + description + longDescription + tags + location { + name + address + country + } + stats { + totalAmountReceived { + value + currency + valueInCents + } + totalAmountReceivedTimeSeries { + dateFrom + dateTo + timeUnit + nodes { + date + amount { + value + currency + valueInCents + } + label + } + } + } + } + } + } + """) + + all_orgs = [] + seen_slugs = set() # To prevent duplicates + + # Fetch orgs for each search term + for term in search_terms: + response = fetch( + query, variable_values={"limit": limit, "offset": 0, "searchTerm": term} + ) + + # Add only unique organizations + for org in response["accounts"]["nodes"]: + if org["slug"] not in seen_slugs: + all_orgs.append(org) + seen_slugs.add(org["slug"]) + + print(f"Found {len(all_orgs)} unique organizations") + return all_orgs + + +# Fetch transactions for an organization with pagination +@cache +def fetch_transactions(org_slug, total_limit=100_000, page_size=1000): + query = gql(""" + query GetAccountTransactions( + $account: [AccountReferenceInput!] + $limit: Int! + $offset: Int! + $orderBy: ChronologicalOrderInput! 
+ ) { + transactions( + account: $account + limit: $limit + offset: $offset + orderBy: $orderBy + ) { + nodes { + id + createdAt + type + amount { + value + currency + } + } + totalCount + } + } + """) + + all_transactions = [] + offset = 0 + while offset < total_limit: + variables = { + "account": [{"slug": org_slug}], + "limit": min(page_size, total_limit - offset), + "offset": offset, + "orderBy": {"field": "CREATED_AT", "direction": "DESC"}, + } + + response = fetch(query, variables, wait_time) + transactions = response["transactions"]["nodes"] + total_count = response["transactions"]["totalCount"] + + all_transactions.extend(transactions) + + # Break if we've fetched all available transactions + if len(transactions) < page_size or offset + page_size >= total_count: + break + + offset += page_size + print(f"Fetched {len(all_transactions)} transactions for {org_slug}") + return all_transactions + +def get_transactions_df(orgs): + all_transactions = [] + for org in tqdm(orgs): + transactions = fetch_transactions(org["slug"]) + if transactions: + # Convert to DataFrame with just date and amount + df = pd.DataFrame( + [ + { + "date": pd.to_datetime(t["createdAt"]).floor("D"), # Floor to day + "amount": float(t["amount"]["value"]) if "amount" in t else 0, + } + for t in transactions + ] + ) + if not df.empty: + df["organization"] = org["name"] + all_transactions.append(df) + if not all_transactions: + return None + return pd.concat(all_transactions) + +def generalize_group_name(name): + if re.search(r"xr|extinction.?rebellion|scientist.?rebellion", name.lower()): + return "Extinction Rebellion" + elif re.search(r"(4|for).?future|fff|klimastreik", name.lower()): + return "Fridays For Future" + elif re.search(r"fossil.?free", name.lower()): + return "Fossil Free" + else: + return name + +def group_by_wealth(df, top_n=10): + # Calculate total donations per organization + total_by_org = df.groupby("organization")["amount"].sum().sort_values(ascending=False) + # Get top N 
organizations + top_orgs = set(total_by_org.head(top_n).index) + # Create a mapping function + def map_org(org): + return org if org in top_orgs else "Other" + return df.assign(organization=df["organization"].apply(map_org)) + +def get_monthly_dfs(df, pivot=False): + monthly = ( + df.set_index("date") + .groupby(["organization", pd.Grouper(freq="W")])["amount"] + .sum() + .reset_index() + ) + + # Create separate positive and negative DataFrames + positive_df = monthly[monthly["amount"] > 0].copy() + negative_df = monthly[monthly["amount"] < 0].copy() + + if pivot: + # Pivot to get organizations as columns + positive_pivot = positive_df.pivot( + index="date", columns="organization", values="amount" + ).fillna(0) + negative_pivot = negative_df.pivot( + index="date", columns="organization", values="amount" + ).fillna(0) + return positive_pivot, negative_pivot + else: + return positive_df, negative_df \ No newline at end of file diff --git a/backend-python/poetry.lock b/backend-python/poetry.lock index 333f7f1..fa244b3 100644 --- a/backend-python/poetry.lock +++ b/backend-python/poetry.lock @@ -1666,6 +1666,47 @@ all = ["jieba", "nltk"] arabic = ["nltk"] chinese = ["jieba"] +[[package]] +name = "gql" +version = "3.5.0" +description = "GraphQL client for Python" +optional = false +python-versions = "*" +files = [ + {file = "gql-3.5.0-py2.py3-none-any.whl", hash = "sha256:70dda5694a5b194a8441f077aa5fb70cc94e4ec08016117523f013680901ecb7"}, + {file = "gql-3.5.0.tar.gz", hash = "sha256:ccb9c5db543682b28f577069950488218ed65d4ac70bb03b6929aaadaf636de9"}, +] + +[package.dependencies] +anyio = ">=3.0,<5" +backoff = ">=1.11.1,<3.0" +graphql-core = ">=3.2,<3.3" +requests = {version = ">=2.26,<3", optional = true, markers = "extra == \"requests\""} +requests-toolbelt = {version = ">=1.0.0,<2", optional = true, markers = "extra == \"requests\""} +yarl = ">=1.6,<2.0" + +[package.extras] +aiohttp = ["aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)"] +all = ["aiohttp (>=3.8.0,<4)", 
"aiohttp (>=3.9.0b0,<4)", "botocore (>=1.21,<2)", "httpx (>=0.23.1,<1)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "websockets (>=10,<12)"] +botocore = ["botocore (>=1.21,<2)"] +dev = ["aiofiles", "aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "black (==22.3.0)", "botocore (>=1.21,<2)", "check-manifest (>=0.42,<1)", "flake8 (==3.8.1)", "httpx (>=0.23.1,<1)", "isort (==4.3.21)", "mock (==4.0.2)", "mypy (==0.910)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "sphinx (>=5.3.0,<6)", "sphinx-argparse (==0.2.5)", "sphinx-rtd-theme (>=0.4,<1)", "types-aiofiles", "types-mock", "types-requests", "vcrpy (==4.4.0)", "websockets (>=10,<12)"] +httpx = ["httpx (>=0.23.1,<1)"] +requests = ["requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)"] +test = ["aiofiles", "aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "botocore (>=1.21,<2)", "httpx (>=0.23.1,<1)", "mock (==4.0.2)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "vcrpy (==4.4.0)", "websockets (>=10,<12)"] +test-no-transport = ["aiofiles", "mock (==4.0.2)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "vcrpy (==4.4.0)"] +websockets = ["websockets (>=10,<12)"] + +[[package]] +name = "graphql-core" +version = "3.2.5" +description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." 
+optional = false +python-versions = "<4,>=3.6" +files = [ + {file = "graphql_core-3.2.5-py3-none-any.whl", hash = "sha256:2f150d5096448aa4f8ab26268567bbfeef823769893b39c1a2e1409590939c8a"}, + {file = "graphql_core-3.2.5.tar.gz", hash = "sha256:e671b90ed653c808715645e3998b7ab67d382d55467b7e2978549111bbabf8d5"}, +] + [[package]] name = "graphviz" version = "0.20.3" @@ -4961,6 +5002,20 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, +] + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + [[package]] name = "rfc3339-validator" version = "0.1.4" @@ -6426,4 +6481,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d48054e0a4c54693410b1dccf4fd8327198a67dffae1c2ebafe4515e2b44dede" +content-hash = "1a5e7188e8c22601c5a45c85897bde3e483643cdb05e2c4f83e136cb077ce5bd" diff --git a/backend-python/pyproject.toml b/backend-python/pyproject.toml index 5a6f109..6191a11 100644 --- a/backend-python/pyproject.toml +++ b/backend-python/pyproject.toml @@ -44,6 +44,7 @@ json-repair = "^0.26.0" freezegun = "^1.5.1" aiolimiter = "^1.1.0" pytest-asyncio = "^0.23.8" +gql = {extras = ["requests"], version = "^3.5.0"} [tool.poetry.group.dev.dependencies] pytest = "^8.0.2" diff --git a/notebooks/2024-11-17-david-opencollective.ipynb b/notebooks/2024-11-17-david-opencollective.ipynb new file mode 100644 index 0000000..371f922 --- /dev/null +++ b/notebooks/2024-11-17-david-opencollective.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "code", 
+ "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 901 unique organizations\n" + ] + }, + { + "data": { + "text/plain": [ + "{'slug': 'climate-caucus',\n", + " 'name': 'Climate Caucus',\n", + " 'legalName': None,\n", + " 'description': 'Connect elected Canadian representatives to drive climate action',\n", + " 'longDescription': None,\n", + " 'tags': ['community'],\n", + " 'location': None,\n", + " 'stats': {'totalAmountReceived': {'value': 0,\n", + " 'currency': 'USD',\n", + " 'valueInCents': 0},\n", + " 'totalAmountReceivedTimeSeries': {'dateFrom': None,\n", + " 'dateTo': None,\n", + " 'timeUnit': 'YEAR',\n", + " 'nodes': []}}}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from media_impact_monitor.data_loaders.donations.opencollective import (\n", + " fetch_climate_orgs,\n", + " fetch_transactions,\n", + " get_transactions_df,\n", + " get_monthly_dfs,\n", + " group_by_wealth,\n", + " generalize_group_name\n", + ")\n", + "\n", + "orgs = fetch_climate_orgs()\n", + "orgs[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'b04b2277-1fb2-4a91-a682-26296a21cf9d',\n", + " 'createdAt': '2024-07-19T11:43:33.884Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -28.78, 'currency': 'GBP'}},\n", + " {'id': '463bbc47-e299-418f-b021-7d9a8cd99717',\n", + " 'createdAt': '2023-12-15T08:21:06.535Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -26.38, 'currency': 'GBP'}},\n", + " {'id': 'c1f4c332-d51a-49af-8cdf-e2e7ca9902e7',\n", + " 'createdAt': '2023-03-29T12:03:06.051Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -42, 'currency': 'GBP'}},\n", + " {'id': '0c936cd0-05fb-49a4-9692-8dec4cdaa501',\n", + " 'createdAt': '2023-03-29T12:03:04.254Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -22, 'currency': 'GBP'}}]" + 
] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transactions = fetch_transactions(orgs[1][\"slug\"])\n", + "transactions[:4]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "36de5496dea24b03af1fabace1c03306", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + " | date | \n", + "amount | \n", + "organization | \n", + "
---|---|---|---|
0 | \n", + "2024-07-19 00:00:00+00:00 | \n", + "-28.78 | \n", + "Climate Hub Wandsworth | \n", + "
1 | \n", + "2023-12-15 00:00:00+00:00 | \n", + "-26.38 | \n", + "Climate Hub Wandsworth | \n", + "
2 | \n", + "2023-03-29 00:00:00+00:00 | \n", + "-42.00 | \n", + "Climate Hub Wandsworth | \n", + "
3 | \n", + "2023-03-29 00:00:00+00:00 | \n", + "-22.00 | \n", + "Climate Hub Wandsworth | \n", + "
4 | \n", + "2022-09-21 00:00:00+00:00 | \n", + "-0.90 | \n", + "Climate Hub Wandsworth | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
30 | \n", + "2024-02-12 00:00:00+00:00 | \n", + "5.00 | \n", + "Climate Collective Oxford | \n", + "
31 | \n", + "2024-01-19 00:00:00+00:00 | \n", + "-42.00 | \n", + "Climate Collective Oxford | \n", + "
32 | \n", + "2024-01-18 00:00:00+00:00 | \n", + "-2.50 | \n", + "Climate Collective Oxford | \n", + "
33 | \n", + "2024-01-18 00:00:00+00:00 | \n", + "-1.03 | \n", + "Climate Collective Oxford | \n", + "
34 | \n", + "2024-01-18 00:00:00+00:00 | \n", + "50.00 | \n", + "Climate Collective Oxford | \n", + "
343 rows × 3 columns
\n", + "\n", + " | organization | \n", + "date | \n", + "amount | \n", + "
---|---|---|---|
0 | \n", + "Climate Crisis Collective | \n", + "2019-08-25 00:00:00+00:00 | \n", + "25000.00 | \n", + "
2 | \n", + "Climate Crisis Collective | \n", + "2019-09-15 00:00:00+00:00 | \n", + "200.00 | \n", + "
5 | \n", + "Climate Crisis Collective | \n", + "2019-10-06 00:00:00+00:00 | \n", + "25000.00 | \n", + "
8 | \n", + "Climate Crisis Collective | \n", + "2019-12-01 00:00:00+00:00 | \n", + "13500.00 | \n", + "
10 | \n", + "Climate Crisis Collective | \n", + "2019-12-22 00:00:00+00:00 | \n", + "466.00 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
1017 | \n", + "The Xylom | \n", + "2024-09-15 00:00:00+00:00 | \n", + "17696.96 | \n", + "
1020 | \n", + "The Xylom | \n", + "2024-10-13 00:00:00+00:00 | \n", + "13.26 | \n", + "
1022 | \n", + "The Xylom | \n", + "2024-10-27 00:00:00+00:00 | \n", + "6.67 | \n", + "
1026 | \n", + "The Xylom | \n", + "2024-12-01 00:00:00+00:00 | \n", + "19.95 | \n", + "
1027 | \n", + "The Xylom | \n", + "2024-12-08 00:00:00+00:00 | \n", + "419.17 | \n", + "
489 rows × 3 columns
\n", + "