-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathidentify_partial_borrowers.py
193 lines (163 loc) · 7.74 KB
/
identify_partial_borrowers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# identify members with partial borrowing history
import os.path
from statistics import mean
import sys
import pandas as pd
sys.path.append("..")
from data.dataset import get_shxco_data, DATA_DIR
# Specify the path to the data directory
PARTIAL_BORROWERS_CSV = os.path.join(DATA_DIR, "partial_borrowers.csv")
PARTIAL_BORROWERS_COLLAPSED_CSV = os.path.join(
DATA_DIR, "partial_borrowers_collapsed.csv"
)
def get_partial_borrowers() -> pd.DataFrame:
"""Function to load partial borrowers csv and return as dataframe
Returns:
pd.DataFrame: dataframe of members with partial borrowing history
"""
# convenience method to load partial borrowers csv and return as dataframe
return pd.read_csv(PARTIAL_BORROWERS_CSV)
def collapse_partial_borrowers(partial_borrows: pd.DataFrame) -> None:
"""Function to collapse sequential subscription periods for each member
Parameters:
partial_borrows (pd.DataFrame): dataframe of members with partial borrowing history
Returns:
None
"""
# collapse sequential periods into single time periods
collapsed_ranges = []
# convert to datetime for comparison
partial_borrows["start_date"] = pd.to_datetime(
partial_borrows.subscription_start, format="%Y-%m-%d", errors="coerce"
)
partial_borrows["end_date"] = pd.to_datetime(
partial_borrows.subscription_end, format="%Y-%m-%d", errors="coerce"
)
# allowed gap between sequential periods
# ?? what amount of time is reasonable here?
allowed_gap = pd.Timedelta(days=90)
for member_id in partial_borrows.member_id.unique():
range_start = None
range_end = None
# get the previously identified subscriptions with no borrowing
member_subs = partial_borrows[partial_borrows.member_id == member_id]
for sub in member_subs.itertuples():
# if we have an existing range, check if this one overlaps,
# or start date follows within the allowed gap
if range_start and (
sub.start_date <= range_end or sub.start_date - range_end <= allowed_gap
):
# extend the current range to the later of the two end dates
# (overlapping supplement could end before subscription)
range_end = max(range_end, sub.end_date)
# store the size of the gap between the ranges, if any
internal_gaps.append(max(0, (sub.start_date - range_end).days))
# if no overlap, save the current range (if any) and start a new one
else:
# if there is an existing range, save it
if range_start:
collapsed_ranges.append(
{
"member_id": member_id,
# TODO: format dates for export
"subscription_start": range_start,
"subscription_end": range_end,
# sequence of subscription events: [subscription, renew, supplement]
"subscription_events": ";".join(events),
"subscription_volumes": mean(volumes),
"subscription_days": (range_end - range_start).days,
"internal_gaps": ";".join(
[str(gap) for gap in internal_gaps]
),
# preserve members' total known borrows
"known_borrows": sub.known_borrows,
}
)
# start a new range based on the current subscription
range_start = sub.start_date
range_end = sub.end_date
events = []
volumes = []
internal_gaps = []
# keep track of subscription events and volumes
events.append(sub.subscription_type)
volumes.append(sub.subscription_volumes)
# save the last range
if range_start and range_end:
collapsed_ranges.append(
{
"member_id": member_id,
"subscription_start": range_start,
"subscription_end": range_end,
"subscription_events": ";".join(events),
"subscription_volumes": mean(volumes),
"subscription_days": (range_end - range_start).days,
"internal_gaps": ";".join([str(gap) for gap in internal_gaps]),
"known_borrows": sub.known_borrows,
}
)
# convert to dataframe and save as csv for use elsewhere
pd.DataFrame(data=collapsed_ranges).to_csv(
PARTIAL_BORROWERS_COLLAPSED_CSV, index=False
)
def identify_partial_borrowers() -> None:
"""Function to identify members with partial borrowing history
Returns:
None
"""
_, _, events_df = get_shxco_data()
date_events = events_df.copy()
date_events["start_datetime"] = pd.to_datetime(
date_events.start_date, format="%Y-%m-%d", errors="coerce"
)
date_events["end_datetime"] = pd.to_datetime(
date_events.end_date, format="%Y-%m-%d", errors="coerce"
)
# filter to subscription events with known start and end date
subscription_events = date_events[
date_events.event_type.isin(["Subscription", "Renewal", "Supplement"])
& date_events.start_datetime.notna()
& date_events.end_datetime.notna()
]
# get all book events (anything with an item uri, ignore event type)
# [strictly speaking should we restrict to borrows?]
book_events = date_events[date_events.item_uri.notna()]
partial_borrowers = []
# look over subscriptions for each member with book events
for member_id in book_events.member_id.unique():
# filter to all subscription and book events for this member
# sort subscription events by start date so we can collapse sequential periods
member_subs = subscription_events[
subscription_events.member_id == member_id
].sort_values("start_date")
member_book_events = book_events[book_events.member_id == member_id]
# check each subscription for any overlapping book events
for sub in member_subs.itertuples():
# NOTE: ignoring unknown end dates
# look for book events that overlap with the subscription dates
sub_book_events = member_book_events[
(sub.start_datetime <= member_book_events.end_datetime)
& (sub.end_datetime >= member_book_events.start_datetime)
]
# if there are no book events within this subscription,
# add it to the list of partial borrower dates
if sub_book_events.empty:
# print('\n%s' % member_id)
# print(f"{sub.start_date} - {sub.end_date} ({sub.event_type}, {sub.subscription_volumes} volumes)")
partial_borrowers.append(
{
"member_id": member_id,
"subscription_start": sub.start_date,
"subscription_end": sub.end_date,
"subscription_type": sub.event_type,
"subscription_volumes": sub.subscription_volumes,
"known_borrows": len(member_book_events.index),
}
)
# load into dataframe and save as csv for use elsewhere
partial_borrows = pd.DataFrame(data=partial_borrowers)
partial_borrows.to_csv(PARTIAL_BORROWERS_CSV, index=False)
# now collapse sequential-ish subscription periods for each member
collapse_partial_borrowers(partial_borrows)
if __name__ == "__main__":
identify_partial_borrowers()