From 3f3673f37e51e940042750b1d0b26adacbee48e9 Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Fri, 6 Dec 2024 21:59:53 +0100 Subject: [PATCH] feat(backend): add opencollective donations data exploration --- .../data_loaders/donations/opencollective.py | 226 ++++++++ backend-python/poetry.lock | 57 +- backend-python/pyproject.toml | 1 + .../2024-11-17-david-opencollective.ipynb | 508 ++++++++++++++++++ 4 files changed, 791 insertions(+), 1 deletion(-) create mode 100644 backend-python/media_impact_monitor/data_loaders/donations/opencollective.py create mode 100644 notebooks/2024-11-17-david-opencollective.ipynb diff --git a/backend-python/media_impact_monitor/data_loaders/donations/opencollective.py b/backend-python/media_impact_monitor/data_loaders/donations/opencollective.py new file mode 100644 index 0000000..7161fd2 --- /dev/null +++ b/backend-python/media_impact_monitor/data_loaders/donations/opencollective.py @@ -0,0 +1,226 @@ +from gql import gql, Client +from gql.transport.requests import RequestsHTTPTransport +import os +from dotenv import load_dotenv +import pandas as pd +import matplotlib.pyplot as plt +from joblib import Memory +from tqdm.auto import tqdm +import time +import re + +# Setup caching +memory = Memory(".cache", verbose=0) +cache = memory.cache +wait_time = 1 + +load_dotenv() +transport = RequestsHTTPTransport( + url="https://api.opencollective.com/graphql/v2", + headers={"Personal-Token": os.getenv("OPENCOLLECTIVE_API_TOKEN")}, +) +client = Client(transport=transport, fetch_schema_from_transport=True) + + +@cache(ignore=["wait_time"]) +def fetch(query, variable_values, wait_time=0.1, **kwargs): + time.sleep(wait_time) + return client.execute(query, variable_values=variable_values, **kwargs) + + +def fetch_climate_orgs(limit=1000): + # Define search terms + search_terms = [ + "climate", + "for future", + "extinction rebellion", + "xr", + "fossil", + "oil", + ] + + query = gql(""" + query GetAccounts($limit: Int, $offset: Int, $searchTerm: String) { + accounts( + limit: $limit + offset: $offset + isActive: true + searchTerm: $searchTerm + type: COLLECTIVE + ) { + nodes { + slug + name + legalName + description + longDescription + tags + location { + name + address + country + } + stats { + totalAmountReceived { + value + currency + valueInCents + } + totalAmountReceivedTimeSeries { + dateFrom + dateTo + timeUnit + nodes { + date + amount { + value + currency + valueInCents + } + label + } + } + } + } + } + } + """) + + all_orgs = [] + seen_slugs = set() # To prevent duplicates + + # Fetch orgs for each search term + for term in search_terms: + response = fetch( + query, variable_values={"limit": limit, "offset": 0, "searchTerm": term} + ) + + # Add only unique organizations + for org in response["accounts"]["nodes"]: + if org["slug"] not in seen_slugs: + all_orgs.append(org) + seen_slugs.add(org["slug"]) + + print(f"Found {len(all_orgs)} unique organizations") + return all_orgs + + +# Fetch transactions for an organization with pagination +@cache +def fetch_transactions(org_slug, total_limit=100_000, page_size=1000): + query = gql(""" + query GetAccountTransactions( + $account: [AccountReferenceInput!] + $limit: Int! + $offset: Int! + $orderBy: ChronologicalOrderInput! 
+ ) { + transactions( + account: $account + limit: $limit + offset: $offset + orderBy: $orderBy + ) { + nodes { + id + createdAt + type + amount { + value + currency + } + } + totalCount + } + } + """) + + all_transactions = [] + offset = 0 + while offset < total_limit: + variables = { + "account": [{"slug": org_slug}], + "limit": min(page_size, total_limit - offset), + "offset": offset, + "orderBy": {"field": "CREATED_AT", "direction": "DESC"}, + } + + response = fetch(query, variables, wait_time) + transactions = response["transactions"]["nodes"] + total_count = response["transactions"]["totalCount"] + + all_transactions.extend(transactions) + + # Break if we've fetched all available transactions + if len(transactions) < page_size or offset + page_size >= total_count: + break + + offset += page_size + print(f"Fetched {len(all_transactions)} transactions for {org_slug}") + return all_transactions + +def get_transactions_df(orgs): + all_transactions = [] + for org in tqdm(orgs): + transactions = fetch_transactions(org["slug"]) + if transactions: + # Convert to DataFrame with just date and amount + df = pd.DataFrame( + [ + { + "date": pd.to_datetime(t["createdAt"]).floor("D"), # Floor to day + "amount": float(t["amount"]["value"]) if "amount" in t else 0, + } + for t in transactions + ] + ) + if not df.empty: + df["organization"] = org["name"] + all_transactions.append(df) + if not all_transactions: + return None + return pd.concat(all_transactions) + +def generalize_group_name(name): + if re.search(r"xr|extinction.?rebellion|scientist.?rebellion", name.lower()): + return "Extinction Rebellion" + elif re.search(r"(4|for).?future|fff|klimatreik", name.lower()): + return "Fridays For Future" + elif re.search(r"fossil.?free", name.lower()): + return "Fossil Free" + else: + return name + +def group_by_wealth(df, top_n=10): + # Calculate total donations per organization + total_by_org = df.groupby("organization")["amount"].sum().sort_values(ascending=False) + # Get top N organizations + top_orgs = set(total_by_org.head(top_n).index) + # Create a mapping function + def map_org(org): + return org if org in top_orgs else "Other" + return df.assign(organization=df["organization"].apply(map_org)) + +def get_monthly_dfs(df, pivot=False): + monthly = ( + df.set_index("date") + .groupby(["organization", pd.Grouper(freq="W")])["amount"] + .sum() + .reset_index() + ) + + # Create separate positive and negative DataFrames + positive_df = monthly[monthly["amount"] > 0].copy() + negative_df = monthly[monthly["amount"] < 0].copy() + + if pivot: + # Pivot to get organizations as columns + positive_pivot = positive_df.pivot( + index="date", columns="organization", values="amount" + ).fillna(0) + negative_pivot = negative_df.pivot( + index="date", columns="organization", values="amount" + ).fillna(0) + return positive_pivot, negative_pivot + else: + return positive_df, negative_df \ No newline at end of file diff --git a/backend-python/poetry.lock b/backend-python/poetry.lock index 333f7f1..fa244b3 100644 --- a/backend-python/poetry.lock +++ b/backend-python/poetry.lock @@ -1666,6 +1666,47 @@ all = ["jieba", "nltk"] arabic = ["nltk"] chinese = ["jieba"] +[[package]] +name = "gql" +version = "3.5.0" +description = "GraphQL client for Python" +optional = false +python-versions = "*" +files = [ + {file = "gql-3.5.0-py2.py3-none-any.whl", hash = "sha256:70dda5694a5b194a8441f077aa5fb70cc94e4ec08016117523f013680901ecb7"}, + {file = "gql-3.5.0.tar.gz", hash = 
"sha256:ccb9c5db543682b28f577069950488218ed65d4ac70bb03b6929aaadaf636de9"}, +] + +[package.dependencies] +anyio = ">=3.0,<5" +backoff = ">=1.11.1,<3.0" +graphql-core = ">=3.2,<3.3" +requests = {version = ">=2.26,<3", optional = true, markers = "extra == \"requests\""} +requests-toolbelt = {version = ">=1.0.0,<2", optional = true, markers = "extra == \"requests\""} +yarl = ">=1.6,<2.0" + +[package.extras] +aiohttp = ["aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)"] +all = ["aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "botocore (>=1.21,<2)", "httpx (>=0.23.1,<1)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "websockets (>=10,<12)"] +botocore = ["botocore (>=1.21,<2)"] +dev = ["aiofiles", "aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "black (==22.3.0)", "botocore (>=1.21,<2)", "check-manifest (>=0.42,<1)", "flake8 (==3.8.1)", "httpx (>=0.23.1,<1)", "isort (==4.3.21)", "mock (==4.0.2)", "mypy (==0.910)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "sphinx (>=5.3.0,<6)", "sphinx-argparse (==0.2.5)", "sphinx-rtd-theme (>=0.4,<1)", "types-aiofiles", "types-mock", "types-requests", "vcrpy (==4.4.0)", "websockets (>=10,<12)"] +httpx = ["httpx (>=0.23.1,<1)"] +requests = ["requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)"] +test = ["aiofiles", "aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "botocore (>=1.21,<2)", "httpx (>=0.23.1,<1)", "mock (==4.0.2)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "vcrpy (==4.4.0)", "websockets (>=10,<12)"] +test-no-transport = ["aiofiles", "mock (==4.0.2)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "vcrpy (==4.4.0)"] +websockets = ["websockets (>=10,<12)"] + +[[package]] +name = "graphql-core" +version = "3.2.5" +description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." 
+optional = false +python-versions = "<4,>=3.6" +files = [ + {file = "graphql_core-3.2.5-py3-none-any.whl", hash = "sha256:2f150d5096448aa4f8ab26268567bbfeef823769893b39c1a2e1409590939c8a"}, + {file = "graphql_core-3.2.5.tar.gz", hash = "sha256:e671b90ed653c808715645e3998b7ab67d382d55467b7e2978549111bbabf8d5"}, +] + [[package]] name = "graphviz" version = "0.20.3" @@ -4961,6 +5002,20 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, +] + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + [[package]] name = "rfc3339-validator" version = "0.1.4" @@ -6426,4 +6481,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d48054e0a4c54693410b1dccf4fd8327198a67dffae1c2ebafe4515e2b44dede" +content-hash = "1a5e7188e8c22601c5a45c85897bde3e483643cdb05e2c4f83e136cb077ce5bd" diff --git a/backend-python/pyproject.toml b/backend-python/pyproject.toml index 5a6f109..6191a11 100644 --- a/backend-python/pyproject.toml +++ b/backend-python/pyproject.toml @@ -44,6 +44,7 @@ json-repair = "^0.26.0" freezegun = "^1.5.1" aiolimiter = "^1.1.0" pytest-asyncio = "^0.23.8" +gql = {extras = ["requests"], version = "^3.5.0"} [tool.poetry.group.dev.dependencies] pytest = "^8.0.2" diff --git a/notebooks/2024-11-17-david-opencollective.ipynb b/notebooks/2024-11-17-david-opencollective.ipynb new file mode 100644 index 0000000..371f922 --- /dev/null +++ b/notebooks/2024-11-17-david-opencollective.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 901 unique organizations\n" + ] + }, + { + "data": { + "text/plain": [ + "{'slug': 'climate-caucus',\n", + " 'name': 'Climate Caucus',\n", + " 'legalName': None,\n", + " 'description': 'Connect elected Canadian representatives to drive climate action',\n", + " 'longDescription': None,\n", + " 'tags': ['community'],\n", + " 'location': None,\n", + " 'stats': {'totalAmountReceived': {'value': 0,\n", + " 'currency': 'USD',\n", + " 'valueInCents': 0},\n", + " 'totalAmountReceivedTimeSeries': {'dateFrom': None,\n", + " 'dateTo': None,\n", + " 'timeUnit': 'YEAR',\n", + " 'nodes': []}}}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from media_impact_monitor.data_loaders.donations.opencollective import (\n", + " fetch_climate_orgs,\n", + " fetch_transactions,\n", + " get_transactions_df,\n", + " get_monthly_dfs,\n", + " group_by_wealth,\n", + " generalize_group_name\n", + ")\n", + "\n", + "orgs = fetch_climate_orgs()\n", + "orgs[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'b04b2277-1fb2-4a91-a682-26296a21cf9d',\n", + " 'createdAt': '2024-07-19T11:43:33.884Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -28.78, 'currency': 'GBP'}},\n", + " {'id': '463bbc47-e299-418f-b021-7d9a8cd99717',\n", + " 'createdAt': '2023-12-15T08:21:06.535Z',\n", + 
" 'type': 'DEBIT',\n", + " 'amount': {'value': -26.38, 'currency': 'GBP'}},\n", + " {'id': 'c1f4c332-d51a-49af-8cdf-e2e7ca9902e7',\n", + " 'createdAt': '2023-03-29T12:03:06.051Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -42, 'currency': 'GBP'}},\n", + " {'id': '0c936cd0-05fb-49a4-9692-8dec4cdaa501',\n", + " 'createdAt': '2023-03-29T12:03:04.254Z',\n", + " 'type': 'DEBIT',\n", + " 'amount': {'value': -22, 'currency': 'GBP'}}]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transactions = fetch_transactions(orgs[1][\"slug\"])\n", + "transactions[:4]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "36de5496dea24b03af1fabace1c03306", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "" + ], + "text/plain": [ + " date amount organization\n", + "0 2024-07-19 00:00:00+00:00 -28.78 Climate Hub Wandsworth\n", + "1 2023-12-15 00:00:00+00:00 -26.38 Climate Hub Wandsworth\n", + "2 2023-03-29 00:00:00+00:00 -42.00 Climate Hub Wandsworth\n", + "3 2023-03-29 00:00:00+00:00 -22.00 Climate Hub Wandsworth\n", + "4 2022-09-21 00:00:00+00:00 -0.90 Climate Hub Wandsworth\n", + ".. ... ... ...\n", + "30 2024-02-12 00:00:00+00:00 5.00 Climate Collective Oxford\n", + "31 2024-01-19 00:00:00+00:00 -42.00 Climate Collective Oxford\n", + "32 2024-01-18 00:00:00+00:00 -2.50 Climate Collective Oxford\n", + "33 2024-01-18 00:00:00+00:00 -1.03 Climate Collective Oxford\n", + "34 2024-01-18 00:00:00+00:00 50.00 Climate Collective Oxford\n", + "\n", + "[343 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = get_transactions_df(orgs[:10])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "217446ed4f4641ada94e077293a05bb7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/150 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "" + ], + "text/plain": [ + " organization date amount\n", + "0 Climate Crisis Collective 2019-08-25 00:00:00+00:00 25000.00\n", + "2 Climate Crisis Collective 2019-09-15 00:00:00+00:00 200.00\n", + "5 Climate Crisis Collective 2019-10-06 00:00:00+00:00 25000.00\n", + "8 Climate Crisis Collective 2019-12-01 00:00:00+00:00 13500.00\n", + "10 Climate Crisis Collective 2019-12-22 00:00:00+00:00 466.00\n", + "... ... ... ...\n", + "1017 The Xylom 2024-09-15 00:00:00+00:00 17696.96\n", + "1020 The Xylom 2024-10-13 00:00:00+00:00 13.26\n", + "1022 The Xylom 2024-10-27 00:00:00+00:00 6.67\n", + "1026 The Xylom 2024-12-01 00:00:00+00:00 19.95\n", + "1027 The Xylom 2024-12-08 00:00:00+00:00 419.17\n", + "\n", + "[489 rows x 3 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "positive" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import altair as alt\n", + "\n", + "\n", + "alt.Chart(positive).mark_area().encode(\n", + " x=\"date:T\", # T: temporal\n", + " y=\"amount:Q\", # Q: quantitative\n", + " color=\"organization:N\", # N: nominal\n", + " tooltip=[\"date\", \"amount\"],\n", + ").properties(width=600, height=100).facet(\n", + " row='organization:N'\n", + ").show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}