SocialChangeLab · davidpomerenke · May 10, 2024 · Apr 14, 2024 · Apr 14, 2024 · Apr 26, 2024
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -61,6 +61,7 @@ jobs:
         ZENROWS_API_KEY: ${{ secrets.ZENROWS_API_KEY }}
 
   build-and-push:
+    if: ${{ github.event_name == 'push' }}
     needs: test
     runs-on: ubuntu-22.04
     outputs:
@@ -99,6 +100,7 @@ jobs:
       run: docker push --all-tags socialchangelab/media-impact-monitor
 
   deploy:
+    if: ${{ github.event_name == 'push' }}
     needs: build-and-push
     runs-on: ubuntu-22.04
     steps:

diff --git a/backend-python/media_impact_monitor/data_loaders/protest/acled.py b/backend-python/media_impact_monitor/data_loaders/protest/acled.py
@@ -3,6 +3,10 @@
 
 import pandas as pd
 from dotenv import load_dotenv
+from media_impact_monitor.data_loaders.protest.acled_size import (
+    get_size_number,
+    get_size_text,
+)
 from media_impact_monitor.util.cache import cache, get
 from media_impact_monitor.util.date import verify_dates
 
@@ -58,7 +62,7 @@ def get_acled_events(
         "event_type": "Protests",
         "event_date": f"{start_date.strftime('%Y-%m-%d')}|{end_date.strftime('%Y-%m-%d')}",
         "event_date_where": "BETWEEN",
-        "fields": "event_date|assoc_actor_1|notes",
+        "fields": "event_date|sub_event_type|assoc_actor_1|country|admin1|admin2|notes|tags",
         "limit": limit,
     }
     assert (countries or regions) and not (
@@ -77,6 +81,30 @@ def get_acled_events(
     if len(df) == limit:
         raise ValueError(f"Limit of {limit} reached.")
     df["date"] = pd.to_datetime(df["event_date"], format="%Y-%m-%d")
+    df["region"] = df["admin1"]
+    df["city"] = df["admin2"]
     df["organizations"] = df["assoc_actor_1"].str.split("; ")
+    df["type"] = df["sub_event_type"]
+    df["size_text"] = df["tags"].apply(get_size_text)
+    df["size_number"] = df["size_text"].apply(get_size_number)
     df["description"] = df["notes"]
-    return df[["date", "description", "organizations"]]
+    return df[
+        [
+            "date",
+            "type",
+            "organizations",
+            "country",
+            "region",
+            "city",
+            "size_text",
+            "size_number",
+            "description",
+        ]
+    ]
+
+
+# Manual smoke test; guarded so importing this module performs no API request.
+if __name__ == "__main__":
+    data = get_acled_events(
+        countries=["Germany"], start_date=date(2020, 1, 1), end_date=date(2020, 3, 30)
+    )
diff --git a/backend-python/media_impact_monitor/data_loaders/protest/acled_size.py b/backend-python/media_impact_monitor/data_loaders/protest/acled_size.py
@@ -0,0 +1,94 @@
+import re
+
+from number_parser import parse_number
+
+
+def get_size_text(acled_string: str | None) -> str | None:
+    """Extract the crowd-size description from an ACLED ``tags`` value.
+
+    Looks for a ``size=...`` entry (optionally written ``crowd size=...``),
+    e.g. ``"crowd size=around 160"``, and returns the text after the equals
+    sign with surrounding whitespace removed.
+
+    Args:
+        acled_string: Raw tags value. Anything that is not a string (``None``,
+            or a float NaN standing in for a missing cell) is treated as
+            "no tags".
+
+    Returns:
+        The size description, or ``None`` when no size entry is present.
+    """
+    # re.search raises TypeError on None/NaN, so bail out before matching.
+    if not isinstance(acled_string, str):
+        return None
+    # Stop at ";" so further tags in the same field are not swallowed into
+    # the size text. NOTE(review): assumes ACLED separates multiple tags with
+    # ";" -- confirm against the ACLED codebook.
+    match = re.search(r"(?:crowd )?size=([^;\n]*)", acled_string)
+    if match is None:
+        return None
+    return match.group(1).strip()
+
+
+def get_size_number(size_text: str | None) -> int | None:
+    """Turn a crowd-size description into an approximate head count.
+
+    Resolution order:
+
+    1. Plain numbers (``"100"``, ``"12.0"``) are converted directly.
+    2. "No information" markers (``"no report"``, ``"nan"``, ...) give ``None``.
+    3. Digits embedded in text are used; when two numbers appear
+       (``"between 100 and 200"``, ``"100-200"``) their floored mean is taken.
+    4. Vague wording is mapped to fixed estimates (``"several"`` -> 5,
+       ``"dozens"`` -> 50, ``"hundreds"`` -> 500, ...).
+    5. Anything left is handed to ``number_parser.parse_number`` for
+       spelled-out numbers such as ``"twenty five"``.
+
+    Matching ignores letter case and surrounding whitespace.
+
+    Args:
+        size_text: Size description, typically the output of
+            ``get_size_text``; ``None`` and NaN mean "no size given".
+
+    Returns:
+        The estimated number of participants, or ``None`` if it cannot be
+        determined.
+    """
+    if not isinstance(size_text, str):
+        # Real numbers pass straight through; None, NaN and infinity carry no
+        # usable size (int() raises TypeError/ValueError/OverflowError).
+        try:
+            return int(size_text)
+        except (TypeError, ValueError, OverflowError):
+            return None
+    text = size_text.strip().lower()
+    try:
+        return int(text)
+    except ValueError:
+        pass
+    try:
+        return int(float(text))
+    except OverflowError:
+        # "inf" / "infinity" parse as floats but are not a crowd size.
+        return None
+    except ValueError:
+        # Not numeric at all, or "nan" (handled by the marker list below).
+        pass
+    if text in ("none", "na", "nan", "", "no report", "no reports"):
+        return None
+    # Drop every thousands separator: "1,500" -> "1500", "1,500,000" -> "1500000".
+    # The lookarounds are zero-width, so consecutive groups are all handled.
+    text = re.sub(r"(?<=\d),(?=\d)", "", text)
+    # Two numbers ("between 100 and 200", "100-200") are read as a range.
+    range_match = re.search(r"(\d+)\D+(\d+)", text)
+    if range_match:
+        low, high = (int(group) for group in range_match.groups())
+        return (low + high) // 2
+    # A single number, possibly wrapped in words ("around 100").
+    number_match = re.search(r"\d+", text)
+    if number_match:
+        return int(number_match.group())
+    # From here on the text contains no digits. Strip counted nouns and
+    # hedging qualifiers so only the quantity wording remains.
+    for suffix in (" tractors", " cars", " bicycles", " vehicles", " people", " of"):
+        text = text.removesuffix(suffix)
+    for prefix in (
+        "around ",
+        "about ",
+        "approximately ",
+        "at least ",
+        "at most ",
+        "up to ",
+        "more than ",
+        "over ",
+        "less than ",
+        "fewer than ",
+        "under ",
+        "nearly ",
+    ):
+        text = text.removeprefix(prefix)
+    if text in (
+        "several",
+        "a handful",
+        "a few",
+        "some",
+        "a group",
+        "a small group",
+        "small group",
+        "a couple",
+        "half dozen",
+        "half-dozen",
+        "half a dozen",
+    ):
+        return 5
+    # "several hundred", "a few thousand", "a couple hundred" -> bare magnitude.
+    for prefix in ("several ", "a ", "few ", "couple "):
+        text = text.removeprefix(prefix)
+    # NOTE: "a dozen" has been reduced to "dozen" by now and therefore yields 50.
+    if text in ("dozens", "dozen", "big group", "large group"):
+        return 50
+    if text in ("hundreds", "hundred"):
+        return 500
+    if text in ("thousands", "thousand"):
+        return 5000
+    if text == "tens of thousands":
+        return 50_000
+    if text == "hundreds of thousands":
+        return 500_000
+    if text.endswith("dozen"):
+        # "two dozen" -> parse "two" (the slice drops the trailing " dozen").
+        num_dozens = parse_number(text[:-6])
+        if num_dozens:
+            return num_dozens * 12
+    # A falsy parse result is reported as "unknown".
+    return parse_number(text) or None
diff --git a/backend-python/media_impact_monitor/data_loaders/protest/acled_size_test.py b/backend-python/media_impact_monitor/data_loaders/protest/acled_size_test.py
@@ -0,0 +1,65 @@
+import pytest
+from media_impact_monitor.data_loaders.protest.acled_size import get_size_number
+
+
+# (size_text, expected_number) pairs, grouped by the kind of phrasing covered.
+SIZE_CASES = [
+    # plain digits, with and without thousands separators
+    ("100", 100),
+    ("1000", 1000),
+    ("500,000", 500000),
+    ("1,500", 1500),
+    # ranges resolve to the mean of both bounds
+    ("between 100 and 200", 150),
+    ("100-200", 150),
+    # a qualifier in front of a number is ignored
+    ("around 100", 100),
+    ("about 500", 500),
+    ("approximately 1000", 1000),
+    ("at least 200", 200),
+    ("at most 500", 500),
+    ("up to 1000", 1000),
+    ("more than 500", 500),
+    ("over 1000", 1000),
+    ("less than 100", 100),
+    ("fewer than 50", 50),
+    ("under 200", 200),
+    ("nearly 500", 500),
+    # vague small amounts
+    ("several", 5),
+    ("a handful", 5),
+    ("a few", 5),
+    ("some", 5),
+    ("a group", 5),
+    ("a small group", 5),
+    ("small group", 5),
+    ("a couple", 5),
+    ("half dozen", 5),
+    ("half-dozen", 5),
+    ("half a dozen", 5),
+    # order-of-magnitude wording and spelled-out numbers
+    ("dozens", 50),
+    ("a dozen", 50),  # TODO
+    ("big group", 50),
+    ("large group", 50),
+    ("hundreds", 500),
+    ("one hundred", 100),
+    ("a couple hundred", 500),
+    ("thousands", 5000),
+    ("one thousand", 1000),
+    ("a few thousand", 5000),
+    ("tens of thousands", 50000),
+    ("hundreds of thousands", 500000),
+    ("two dozen", 24),
+    ("five dozen", 60),
+    ("twenty five", 25),
+    # markers for missing information
+    ("None", None),
+    ("na", None),
+    ("nan", None),
+    ("", None),
+    ("no report", None),
+    ("no reports", None),
+]
+
+
+@pytest.mark.parametrize(("size_text", "expected_number"), SIZE_CASES)
+def test_get_size_number(size_text, expected_number):
+    """Each size description resolves to the expected estimate."""
+    actual = get_size_number(size_text)
+    assert actual == expected_number
+
+
+def test_get_size_number_invalid_input():
+    """Text that carries no size information yields ``None``."""
+    result = get_size_number("invalid")
+    assert result is None
diff --git a/backend-python/media_impact_monitor/types_.py b/backend-python/media_impact_monitor/types_.py
@@ -55,9 +55,17 @@ class Event:
     source: EventSource = Field(description="The source dataset.")
     topic: Topic = Field(description="The topic of the event.")
     date: date_ = Field(description="The date of the event.")
+    country: str = Field(description="The country where the event took place.")
+    region: str = Field(description="The region where the event took place.")
+    city: str = Field(description="The city where the event took place.")
     organizations: list[str] = Field(
         description="The organizations involved in the event."
     )
+    type_: str = Field(description="Type of the event: ...")
+    size_text: str = Field(description="Size of the event, in words.")
+    size_number: int | None = Field(
+        description="Size of the event, quantified if possible."
+    )
     description: str = Field(description="Description of the event.")
 
 

diff --git a/backend-python/poetry.lock b/backend-python/poetry.lock
diff --git a/backend-python/pyproject.toml b/backend-python/pyproject.toml
@@ -28,6 +28,7 @@ uvicorn = {extras = ["standard"], version = "^0.29.0"}
 pydantic = "^2.6.4"
 requests = "^2.31.0"
 websockets = "^12.0"
+number-parser = "^0.3.2"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.0.2"