Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 45 acled participant numbers #75

Merged
merged 4 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ jobs:
ZENROWS_API_KEY: ${{ secrets.ZENROWS_API_KEY }}

build-and-push:
if: ${{ github.event_name == 'push' }}
needs: test
runs-on: ubuntu-22.04
outputs:
Expand Down Expand Up @@ -99,6 +100,7 @@ jobs:
run: docker push --all-tags socialchangelab/media-impact-monitor

deploy:
if: ${{ github.event_name == 'push' }}
needs: build-and-push
runs-on: ubuntu-22.04
steps:
Expand Down
32 changes: 30 additions & 2 deletions backend-python/media_impact_monitor/data_loaders/protest/acled.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

import pandas as pd
from dotenv import load_dotenv
from media_impact_monitor.data_loaders.protest.acled_size import (
get_size_number,
get_size_text,
)
from media_impact_monitor.util.cache import cache, get
from media_impact_monitor.util.date import verify_dates

Expand Down Expand Up @@ -58,7 +62,7 @@ def get_acled_events(
"event_type": "Protests",
"event_date": f"{start_date.strftime('%Y-%m-%d')}|{end_date.strftime('%Y-%m-%d')}",
"event_date_where": "BETWEEN",
"fields": "event_date|assoc_actor_1|notes",
"fields": "event_date|sub_event_type|assoc_actor_1|country|admin1|admin2|notes|tags",
"limit": limit,
}
assert (countries or regions) and not (
Expand All @@ -77,6 +81,30 @@ def get_acled_events(
if len(df) == limit:
raise ValueError(f"Limit of {limit} reached.")
df["date"] = pd.to_datetime(df["event_date"], format="%Y-%m-%d")
df["region"] = df["admin1"]
df["city"] = df["admin2"]
df["organizations"] = df["assoc_actor_1"].str.split("; ")
df["type"] = df["sub_event_type"]
df["size_text"] = df["tags"].apply(get_size_text)
df["size_number"] = df["size_text"].apply(get_size_number)
df["description"] = df["notes"]
return df[["date", "description", "organizations"]]
return df[
[
"date",
"type",
"organizations",
"country",
"region",
"city",
"size_text",
"size_number",
"description",
]
]


# Runs at import time: requests ACLED events for Germany between 2020-01-01
# and 2020-03-30 and binds the resulting frame to the module-level name `data`.
# NOTE(review): this looks like a debugging leftover — importing the module
# triggers the call. Consider removing it or moving it under
# `if __name__ == "__main__":` after confirming nothing imports `data`.
data = get_acled_events(
countries=["Germany"],
start_date=date(2020, 1, 1),
end_date=date(2020, 3, 30),
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import re

from number_parser import parse_number


def get_size_text(acled_string: str) -> str | None:
# match string parts like "crowd size=around 160"
match = re.search(r"(crowd )?size=(.*)", acled_string)
if match:
size_specifier = match.group(2).strip()
return size_specifier
return None


def get_size_number(size_text: str) -> int | None:
try:
return int(size_text)
except ValueError:
pass
try:
return int(float(size_text))
except ValueError:
pass
if str(size_text) in ["None", "na", "nan", "", "no report", "no reports"]:
return None
# remove comma from 500,000; 1,500; etc.
size_text = re.sub(r"(\d+),(\d+)", r"\1\2", size_text)
# match string parts like "between 100 and 200", "100-200", etc.
# if there are multiple numbers, take the mean
multi_match = re.search(r"(\d+)\D+(\d+)", size_text)
if multi_match:
return (int(multi_match.group(1)) + int(multi_match.group(2))) // 2
# match string parts like "around 100", "100", etc.
single_match = re.search(r"(\d+)", size_text)
if single_match:
return int(single_match.group(1))
size_text = (
size_text.removesuffix(" tractors")
.removesuffix(" cars")
.removesuffix(" bicycles")
.removesuffix(" vehicles")
.removesuffix(" people")
.removesuffix(" of")
)
size_text = (
size_text.removeprefix("around ")
.removeprefix("about ")
.removeprefix("approximately ")
.removeprefix("at least ")
.removeprefix("at most ")
.removeprefix("up to ")
.removeprefix("more than ")
.removeprefix("over ")
.removeprefix("less than ")
.removeprefix("fewer than ")
.removeprefix("under ")
.removeprefix("nearly ")
)
if size_text in [
"several",
"a handful",
"a few",
"some",
"a group",
"a small group",
"small group",
"a couple",
"half dozen",
"half-dozen",
"half a dozen",
]:
return 5
size_text = (
size_text.removeprefix("several ")
.removeprefix("a ")
.removeprefix("few ")
.removeprefix("couple ")
)
if size_text in ["dozens", "dozen", "big group", "large group"]:
return 50
if size_text in ["hundreds", "hundred"]:
return 500
if size_text in ["thousands", "thousand"]:
return 5000
if size_text in ["tens of thousands"]:
return 50_000
if size_text in ["hundreds of thousands"]:
return 500_000
if size_text.endswith("dozen"):
num_dozens = parse_number(size_text[:-6])
if num_dozens:
return num_dozens * 12
parsed = parse_number(size_text)
return parsed or None
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import pytest
from media_impact_monitor.data_loaders.protest.acled_size import get_size_number


# (size_text, expected_number) pairs fed to test_get_size_number.
_SIZE_NUMBER_CASES = [
    # plain digits, with or without thousands separators
    ("100", 100),
    ("1000", 1000),
    ("500,000", 500000),
    ("1,500", 1500),
    # ranges are averaged
    ("between 100 and 200", 150),
    ("100-200", 150),
    # qualifiers in front of a number are ignored
    ("around 100", 100),
    ("about 500", 500),
    ("approximately 1000", 1000),
    ("at least 200", 200),
    ("at most 500", 500),
    ("up to 1000", 1000),
    ("more than 500", 500),
    ("over 1000", 1000),
    ("less than 100", 100),
    ("fewer than 50", 50),
    ("under 200", 200),
    ("nearly 500", 500),
    # vague small-group phrases
    ("several", 5),
    ("a handful", 5),
    ("a few", 5),
    ("some", 5),
    ("a group", 5),
    ("a small group", 5),
    ("small group", 5),
    ("a couple", 5),
    ("half dozen", 5),
    ("half-dozen", 5),
    ("half a dozen", 5),
    # order-of-magnitude words
    ("dozens", 50),
    ("a dozen", 50),  # TODO
    ("big group", 50),
    ("large group", 50),
    ("hundreds", 500),
    ("one hundred", 100),
    ("a couple hundred", 500),
    ("thousands", 5000),
    ("one thousand", 1000),
    ("a few thousand", 5000),
    ("tens of thousands", 50000),
    ("hundreds of thousands", 500000),
    # spelled-out numbers
    ("two dozen", 24),
    ("five dozen", 60),
    ("twenty five", 25),
    # markers for "no size available"
    ("None", None),
    ("na", None),
    ("nan", None),
    ("", None),
    ("no report", None),
    ("no reports", None),
]


@pytest.mark.parametrize("size_text, expected_number", _SIZE_NUMBER_CASES)
def test_get_size_number(size_text, expected_number):
    """Each size description is converted to its expected estimate."""
    actual = get_size_number(size_text)
    assert actual == expected_number


def test_get_size_number_invalid_input():
    """Text that contains no recognisable number yields no estimate."""
    result = get_size_number("invalid")
    assert result is None
8 changes: 8 additions & 0 deletions backend-python/media_impact_monitor/types_.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,17 @@ class Event:
source: EventSource = Field(description="The source dataset.")
topic: Topic = Field(description="The topic of the event.")
date: date_ = Field(description="The date of the event.")
country: str = Field(description="The country where the event took place.")
region: str = Field(description="The region where the event took place.")
city: str = Field(description="The city where the event took place.")
organizations: list[str] = Field(
description="The organizations involved in the event."
)
type_: str = Field(description="Type of the event: ...")
size_text: str = Field(description="Size of the event, in words.")
size_number: int | None = Field(
description="Size of the event, quantified if possible."
)
description: str = Field(description="Description of the event.")


Expand Down
16 changes: 15 additions & 1 deletion backend-python/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions backend-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ uvicorn = {extras = ["standard"], version = "^0.29.0"}
pydantic = "^2.6.4"
requests = "^2.31.0"
websockets = "^12.0"
number-parser = "^0.3.2"

[tool.poetry.group.dev.dependencies]
pytest = "^8.0.2"
Expand Down
Loading