Skip to content

Commit

Permalink
pair rarity
Browse files Browse the repository at this point in the history
  • Loading branch information
wowtor committed Dec 1, 2023
1 parent 11c72a3 commit 0c90ca8
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 66 deletions.
4 changes: 3 additions & 1 deletion data_analysis/applications/tracks_and_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from data_analysis.utils import get_layers, get_tracks_pairs_from_csv, \
load_measurements_to_df, get_colormap, get_html_color_legend, \
get_switches_and_rarest_pairs, find_date_range, map_ts_to_day_beginning_at_5am
from telcell.utils.transform import categorize_measurement_by_coordinates


class TrackDashboard:
Expand All @@ -28,7 +29,8 @@ def app(self):
# read the data
registrations_df = load_measurements_to_df(self.file_name, map_ts_to_day_beginning_at_5am)
track_pairs = get_tracks_pairs_from_csv(self.file_name)
registration_pairs = get_switches_and_rarest_pairs(track_pairs, self.max_delay)
registration_pairs = get_switches_and_rarest_pairs(track_pairs,
categorize_measurement_for_rarity=categorize_measurement_by_coordinates, max_delay=self.max_delay)

# Assign a color to each device/owner combination
colormap = get_colormap(['tab10', 'Set3', 'Dark2', 'tab20b'])
Expand Down
14 changes: 7 additions & 7 deletions data_analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from telcell.data.models import Track
from telcell.data.parsers import parse_measurements_csv
from telcell.utils.transform import get_switches, \
sort_pairs_based_on_rarest_location, slice_track_pairs_to_intervals, \
create_track_pairs
slice_track_pairs_to_intervals, \
create_track_pairs, get_pair_with_rarest_measurement_b

rd_to_wgs84 = Transformer.from_crs(crs_from="EPSG:28992", crs_to="EPSG:4326")
GEOD = pyproj.Geod(ellps='WGS84')
Expand Down Expand Up @@ -204,36 +204,36 @@ def get_tracks_pairs_from_csv(file_name: str) -> List[Tuple[Track, Track, Mappin


def get_switches_and_rarest_pairs(data: List[Tuple[Track, Track, Mapping[str, Any]]],
categorize_measurement_for_rarity: Callable,
max_delay: int = None) -> pd.DataFrame:
"""
Load all registration pairs from pairs of tracks, store them in a
dataframe. We also indicate for each day what pair is selected based on
rarest location and maximum time difference (if any).
:param data: the paired tracks for each time interval.
:param categorize_measurement_for_rarity: a categorization function of measurements to be used to determine rarity
:param max_delay: maximum time difference (seconds) of a registration pair.
Default: no max_delay, all pairs are returned.
:return: a dataframe of paired measurements.
"""
df = []
for track_a, track_b, kwargs in tqdm(data):
switches = get_switches(track_a, track_b)
sorted_pairs_by_rarity_b = sort_pairs_based_on_rarest_location(
rarity, rarest_pair = get_pair_with_rarest_measurement_b(
switches=switches,
history_track_b=kwargs['background_b'],
round_lon_lats=False,
categorize_measurement_for_rarity=categorize_measurement_for_rarity,
max_delay=max_delay)

sorted_pairs_by_rarity_b = {pair: count for count, pair in sorted_pairs_by_rarity_b}
# we want the pair with rarest location, since the dict is sorted, we take the first.
rarest_pair = list(sorted_pairs_by_rarity_b.keys())[0] if sorted_pairs_by_rarity_b else None
device_owner_a, device_owner_b = track_a.device + "_" + track_a.owner, track_b.device + "_" + track_b.owner

df.extend([
[kwargs['interval'][0],
*pair.measurement_a.latlon, pair.measurement_a.timestamp, device_owner_a,
*pair.measurement_b.latlon, pair.measurement_b.timestamp, device_owner_b,
pair.distance, pair.time_difference.seconds,
pair == rarest_pair, sorted_pairs_by_rarity_b.get(pair)]
pair == rarest_pair, rarity, ]
for pair in switches
])

Expand Down
8 changes: 5 additions & 3 deletions telcell/models/rare_pair_feature_based.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import groupby
from typing import Sequence, Mapping, Optional, List, Any, Tuple
from typing import Sequence, Mapping, Optional, List, Any, Tuple, Callable

from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
Expand Down Expand Up @@ -28,8 +28,10 @@ class RarePairModel(Model):
"""

def __init__(self, coverage_training_data: Sequence[CoverageData], transformer: BaseTransformer, bins: List[Bin],
categorize_measurement_for_rarity: Callable,
priors: Mapping, fit_models: bool = True):
self.bins = bins
self.categorize_measurement_for_rarity = categorize_measurement_for_rarity
self.transformer = transformer
self.coverage_training_data = self.filter_timediff(coverage_training_data)
self.fit_models = fit_models
Expand All @@ -50,8 +52,8 @@ def predict_lr(self, track_a: Track, track_b: Track, **kwargs) \
if not track_a or not track_b:
return None, None
switches = get_switches(track_a, track_b)
sorted_pairs = sort_pairs_based_on_rarest_location(switches=switches, history_track_b=kwargs['background_b'],
round_lon_lats=False, max_delay=self.max_delay)
rarest_pair = sort_pairs_based_on_rarest_location(switches=switches, history_track_b=kwargs['background_b'],
categorize_measurement_for_rarity=self.categorize_measurement_for_rarity, max_delay=self.max_delay)
if not sorted_pairs:
return None, None
_, rarest_pair = sorted_pairs[0] # we want the pair with the rarest location
Expand Down
11 changes: 6 additions & 5 deletions telcell/models/simplemodel.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Tuple, Optional, Mapping
from typing import List, Tuple, Optional, Mapping, Callable

import lir
import numpy as np
Expand All @@ -8,7 +8,7 @@
from telcell.data.models import Track
from telcell.models import Model
from telcell.utils.transform import get_switches, select_colocated_pairs, generate_all_pairs, \
sort_pairs_based_on_rarest_location
sort_pairs_based_on_rarest_location, get_pair_with_rarest_measurement_b


class MeasurementPairClassifier(Model):
Expand All @@ -21,14 +21,15 @@ class MeasurementPairClassifier(Model):
scores that are provided by the logistic regression.
"""

def __init__(self, colocated_training_data: List[Track]):
def __init__(self, colocated_training_data: List[Track], categorize_measurement_for_rarity: Callable):
self.training_data = colocated_training_data
self.colocated_training_pairs = select_colocated_pairs(self.training_data)
self.categorize_measurement_for_rarity = categorize_measurement_for_rarity

def predict_lr(self, track_a: Track, track_b: Track, **kwargs) -> Tuple[float, Optional[Mapping]]:
pairs = get_switches(track_a, track_b)
pair = sort_pairs_based_on_rarest_location(switches=pairs, history_track_b=kwargs['background_b'],
round_lon_lats=True)[0][1]
_, pair = get_pair_with_rarest_measurement_b(switches=pairs, history_track_b=kwargs['background_b'],
categorize_measurement_for_rarity=self.categorize_measurement_for_rarity)

# resulting pairs need not be really dislocated, but simulated
# dislocation by temporally shifting track a's history towards the
Expand Down
68 changes: 47 additions & 21 deletions telcell/utils/transform.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from collections import Counter, defaultdict
from datetime import datetime, timedelta, time
from itertools import combinations
from typing import Iterator, Tuple, Mapping, Any, List
from typing import Iterator, Tuple, Mapping, Any, List, Callable

from more_itertools import pairwise

Expand Down Expand Up @@ -115,53 +115,79 @@ def filter_delay(paired_measurements: List[MeasurementPair],
if x.time_difference <= max_delay]


def sort_pairs_based_on_rarest_location(
def categorize_measurement_by_coordinates(measurement: Measurement) -> Any:
    """
    Categorize a measurement by its exact (unrounded) coordinates.

    :param measurement: the measurement to categorize; its `lon` and `lat`
        attributes are read.
    :return: the category as a string of the form "<lon>_<lat>".
    """
    category = f'{measurement.lon}_{measurement.lat}'
    return category


def categorize_measurement_by_rounded_coordinates(measurement: Measurement) -> Any:
    """
    Categorize a measurement by its coordinates formatted to two decimals.

    Measurements whose `lon` and `lat` agree after formatting with two
    decimals end up in the same category.

    :param measurement: the measurement to categorize; its `lon` and `lat`
        attributes are read.
    :return: the category as a string of the form "<lon>_<lat>", with both
        values formatted with two decimals.
    """
    category = f'{measurement.lon:.2f}_{measurement.lat:.2f}'
    return category


def get_pair_with_rarest_measurement_b(
        switches: List[MeasurementPair],
        history_track_b: Track,
        categorize_measurement_for_rarity: Callable,
        max_delay: int = None
) -> Tuple[int, MeasurementPair]:
    """
    Select the pair whose `measurement_b` falls in the rarest category.

    The pairs are filtered and ordered by
    `_sort_pairs_based_on_rarest_location`: pairs are first filtered on the
    allowed time difference of the two registrations of a single pair, then
    sorted on the rarity of `measurement_b` with respect to
    `categorize_measurement_for_rarity` and secondarily on the time
    difference of the pair. Only the first pair of that ordering is returned.

    :param switches: A list with all paired measurements to consider.
    :param history_track_b: the history of track_b to find the rarity of locations.
    :param categorize_measurement_for_rarity: callable which returns a category specification
            of a measurement in order to determine its rarity
    :param max_delay: maximum allowed time difference (seconds) in a pair.
            Default: no max_delay, all pairs are considered.
    :return: A tuple `(category_count, pair)` for the selected pair. The
            category count is the number of occurrences of the category of
            `measurement_b` in the track history that is provided.
    :raises ValueError: if there is no pair to select, i.e. `switches` is
            empty or no pair is left after filtering on `max_delay`.
    """
    sorted_pairs = _sort_pairs_based_on_rarest_location(
        switches, history_track_b, categorize_measurement_for_rarity, max_delay)

    # Explicit check rather than `assert`: assertions are removed when Python
    # runs with -O, which would turn an empty result into a bare IndexError.
    if not sorted_pairs:
        raise ValueError(
            'no measurement pairs to select the rarest pair from')

    return sorted_pairs[0]


def _sort_pairs_based_on_rarest_location(
        switches: List[MeasurementPair],
        history_track_b: Track,
        categorize_measurement_for_rarity: Callable,
        max_delay: int = None
) -> List[Tuple[int, MeasurementPair]]:
    """
    Pairs are first filtered on allowed time interval of the two registrations
    of a single pair. Then, sort pairs based on the rarity of `measurement_b`
    with respect to `categorize_measurement_for_rarity` and secondarily by
    time difference of the pair. All remaining pairs are returned in that
    order; the result is an empty list if there are no pairs.

    :param switches: A list with all paired measurements to consider.
    :param history_track_b: the history of track_b to find the rarity of locations.
    :param categorize_measurement_for_rarity: callable which returns a category specification
            of a measurement in order to determine its rarity
    :param max_delay: maximum allowed time difference (seconds) in a pair.
            Default: no max_delay, show all possible pairs.
    :return: The category counts and measurement pairs that are sorted on the
            rarest category based on the history and time difference. The
            category count is the number of occurrences of the category of
            measurement_b in the track history that is provided.

    TODO There is a problem with testdata, because those are almost continuous
    lat/lon data, making rarity of locations not as straightforward.
    Pseudo-solution for now: round lon/lats to two decimals and determine
    rarity of those.
    This should not be used if locations are actual cell-ids
    """

    def sort_key(element):
        # rarest category first; ties are broken by the smallest time difference
        rarity, pair = element
        return rarity, pair.time_difference

    # number of occurrences of every category in the history of track b
    category_counts = Counter(
        categorize_measurement_for_rarity(m) for m in history_track_b.measurements)

    # a falsy max_delay (None or 0) means that no filtering takes place
    if max_delay:
        switches = filter_delay(switches, timedelta(seconds=max_delay))

    # a category that does not occur in the history gets a count of 0
    counted_pairs = [
        (category_counts.get(categorize_measurement_for_rarity(pair.measurement_b), 0), pair)
        for pair in switches
    ]

    return sorted(counted_pairs, key=sort_key)
Expand Down
11 changes: 6 additions & 5 deletions tests/models/test_simplemodel.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from datetime import timedelta, datetime, timezone
from datetime import timedelta

from telcell.data.parsers import parse_measurements_csv
from telcell.utils.transform import get_switches, filter_delay, sort_pairs_based_on_rarest_location
from telcell.utils.transform import get_switches, filter_delay, \
get_pair_with_rarest_measurement_b, categorize_measurement_by_rounded_coordinates


def test_simplemodel(testdata_3days_path):
Expand All @@ -16,9 +17,9 @@ def test_simplemodel(testdata_3days_path):

assert len(paired_measurements) == len(filtered_measurement_pairs)

rarest_measurement_pair = \
sort_pairs_based_on_rarest_location(
_, rarest_measurement_pair = \
get_pair_with_rarest_measurement_b(
filtered_measurement_pairs,
track_b,
round_lon_lats=True)[0][1]
categorize_measurement_for_rarity=categorize_measurement_by_rounded_coordinates)
assert rarest_measurement_pair
42 changes: 18 additions & 24 deletions tests/utils/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from telcell.data.models import Measurement, Track, Point
from telcell.utils.transform import get_switches, create_track_pairs, \
is_colocated, sort_pairs_based_on_rarest_location, MeasurementPair
is_colocated, MeasurementPair, categorize_measurement_by_coordinates, _sort_pairs_based_on_rarest_location, \
categorize_measurement_by_rounded_coordinates


def test_get_switches(test_data_3days):
Expand Down Expand Up @@ -69,10 +70,10 @@ def test_sort_by_time_diff_for_same_location_rarity(max_delay):
]
background_b = Track('', '',
[s.measurement_b for s in switches_one_location])
sorted_pairs = sort_pairs_based_on_rarest_location(switches_one_location,
sorted_pairs = _sort_pairs_based_on_rarest_location(switches_one_location,
background_b,
False,
max_delay)
categorize_measurement_for_rarity=categorize_measurement_by_coordinates,
max_delay=max_delay)

# manually check correct timestamps of measurement_b
correct_timestamps_b = [1, 4, 4, 5, 8, 8, 9, 21, 24, 24]
Expand Down Expand Up @@ -102,10 +103,8 @@ def test_sort_by_location_rarity_for_same_time_diff(max_delay):
]
background_b = Track('', '',
[s.measurement_b for s in switches_one_time_diff])
sorted_pairs = sort_pairs_based_on_rarest_location(switches_one_time_diff,
background_b,
False,
max_delay)
sorted_pairs = _sort_pairs_based_on_rarest_location(switches_one_time_diff, background_b,
categorize_measurement_for_rarity=categorize_measurement_by_coordinates, max_delay=max_delay)

# manually check correct location counts and longitude of measurement_b
correct_counts_lon_b = [(1, 3.0), (1, 4.0), (3, 2.0), (3, 2.0), (3, 2.0),
Expand All @@ -122,10 +121,8 @@ def test_sort_by_location_rarity_for_same_time_diff(max_delay):

background_a = Track('', '',
[s.measurement_a for s in switches_one_time_diff])
sorted_pairs = sort_pairs_based_on_rarest_location(switches_one_time_diff,
background_a,
False,
max_delay)
sorted_pairs = _sort_pairs_based_on_rarest_location(switches_one_time_diff, background_a,
categorize_measurement_for_rarity=categorize_measurement_by_coordinates, max_delay=max_delay)
# with a wrong background that has no intersection with 'track_b',
# the counts should be zero
assert all(count == 0 for count, _ in sorted_pairs)
Expand All @@ -144,10 +141,9 @@ def test_sort_outside_bin(max_delay):
for _ in range(10)
]
background_b = Track('', '', [s.measurement_b for s in switches])
sorted_pairs = sort_pairs_based_on_rarest_location(switches,
background_b,
False,
max_delay)
sorted_pairs = _sort_pairs_based_on_rarest_location(switches, background_b,
categorize_measurement_for_rarity=categorize_measurement_by_coordinates,
max_delay=max_delay)

assert len(sorted_pairs) == 5 # some pairs with too large time difference are filtered
assert max(s[1].time_difference for s in sorted_pairs).seconds == 120
Expand All @@ -165,17 +161,15 @@ def test_round_lat_lons(max_delay):
for i in range(10)
]
background_b = Track('', '', [s.measurement_b for s in switches])
sorted_pairs = sort_pairs_based_on_rarest_location(switches,
background_b,
False,
max_delay)
sorted_pairs = _sort_pairs_based_on_rarest_location(switches, background_b,
categorize_measurement_for_rarity=categorize_measurement_by_coordinates,
max_delay=max_delay)
# without rounding, we have unique locations
assert all(count == 1 for count, _ in sorted_pairs)

sorted_pairs = sort_pairs_based_on_rarest_location(switches,
background_b,
True,
max_delay)
sorted_pairs = _sort_pairs_based_on_rarest_location(switches, background_b,
categorize_measurement_for_rarity=categorize_measurement_by_rounded_coordinates,
max_delay=max_delay)
# with rounding, the locations are identical
assert all(count == 10 for count, _ in sorted_pairs)

0 comments on commit 0c90ca8

Please sign in to comment.