Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle dates and timedeltas in aggregators and DescribeSheet #2380

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
63 changes: 53 additions & 10 deletions visidata/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import collections
import statistics
from copy import copy
import datetime

from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS
Expand Down Expand Up @@ -105,10 +106,46 @@ def aggregator(vd, name, funcValues, helpstr='', *, type=None):
def mean(vals):
    '''Arithmetic mean of *vals*.

    Dates are averaged via their POSIX timestamps and returned as a
    stdlib ``datetime.date``; timedeltas are averaged directly (they
    support sum and division natively).  Returns None for empty input.
    '''
    vals = list(vals)
    if not vals:
        return None
    first = vals[0]
    if isinstance(first, datetime.datetime):
        # Dates cannot be summed; average their timestamps instead and
        # convert the midpoint back into a date.
        avg_ts = sum(d.timestamp() for d in vals) / len(vals)
        return datetime.date.fromtimestamp(avg_ts)
    if isinstance(first, datetime.timedelta):
        # timedelta supports sum() and true division by an int directly.
        return sum(vals, datetime.timedelta()) / len(vals)
    return float(sum(vals)) / len(vals)

def vsum(vals):
    '''Sum of *vals*, seeding sum() with the first value's type as the
    additive identity so non-int types (e.g. timedelta) sum cleanly. #1996

    Dates have no meaningful sum, so that case is a user error (vd.fail).
    Returns 0 for empty input.
    '''
    if not vals:
        return 0
    if isinstance(vals[0], datetime.datetime):
        # per review: a user error, not an internal one
        vd.fail('dates cannot be summed')
        return None
    return sum(vals, start=type(vals[0])())

def median(vals):
    '''Median of *vals*; returns None for empty input.

    For dates, statistics.median must average the two middle values when
    the length is even, which dates do not support — so dates go through
    POSIX timestamps and come back as a stdlib ``datetime.date``.
    '''
    vals = list(vals)
    if not vals:
        return None
    if isinstance(vals[0], datetime.datetime):
        ts_median = statistics.median(d.timestamp() for d in vals)
        return datetime.date.fromtimestamp(ts_median)
    return statistics.median(vals)

def stdev(vals):
    '''Sample standard deviation of *vals*.

    The spread of either dates or timedeltas is a duration, so both
    cases convert to seconds and return a ``datetime.timedelta``.
    Requires at least two data points; otherwise reports an error and
    returns None.
    '''
    vals = list(vals)
    if len(vals) < 2:
        vd.error('stdev requires at least two data points')
        return None
    first = vals[0]
    if isinstance(first, datetime.datetime):
        return datetime.timedelta(seconds=statistics.stdev(d.timestamp() for d in vals))
    if isinstance(first, datetime.timedelta):
        return datetime.timedelta(seconds=statistics.stdev(d.total_seconds() for d in vals))
    return statistics.stdev(vals)

# http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
def _percentile(N, percent, key=lambda x:x):
Expand Down Expand Up @@ -146,17 +183,17 @@ def quantiles(q, helpstr):
return [PercentileAggregator(round(100*i/q), helpstr) for i in range(1, q)]


# Register the built-in aggregators.  The numeric-style aggregators use
# type=anytype so date/timedelta results are not coerced to float; the
# aggregation functions above decide the result type themselves.
vd.aggregator('min', min, 'minimum value', type=anytype)
vd.aggregator('max', max, 'maximum value', type=anytype)
vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype)
vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype)
vd.aggregator('median', median, 'median of values', type=anytype)
vd.aggregator('mode', statistics.mode, 'mode of values')
vd.aggregator('sum', vsum, 'sum of values', type=anytype)
vd.aggregator('distinct', set, 'distinct values', type=vlen)
vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
vd.aggregator('list', list, 'list of values', type=anytype)
vd.aggregator('stdev', stdev, 'standard deviation of values', type=anytype)

vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)')
Expand Down Expand Up @@ -236,14 +273,20 @@ def _aggregateTotalAsync(col, agg):
@asyncthread
def memo_aggregate(col, agg_choices, rows):
    'Show aggregated value in status, and add to memory.'
    if not rows:
        vd.fail('no rows to aggregate')
    for agg_choice in agg_choices:
        agg = vd.aggregators.get(agg_choice)
        if not agg:
            continue
        # a choice like 'q4' maps to a list of percentile aggregators
        aggs = agg if isinstance(agg, list) else [agg]
        for agg in aggs:
            aggval = agg.aggregate(col, rows)
            typedval = wrapply(agg.type or col.type, aggval)
            if agg.name == 'stdev' and col.type is date:
                # NOTE(review): special-case — the stdev of a date column
                # is a timedelta, which a date column's formatter cannot
                # render; fall back to plain str().
                dispval = str(typedval)
            else:
                dispval = col.format(typedval)
            k = col.name + '_' + agg.name
            vd.status(f'{k}={dispval}')
            vd.memory[k] = typedval
Expand Down
23 changes: 12 additions & 11 deletions visidata/features/describe.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from copy import copy
from statistics import mode, median, mean, stdev
from statistics import mode
import datetime

from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply
from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype, date
from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet


vd.option('describe_aggrs', 'mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators)
vd.option('describe_aggrs', 'min max sum median mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators)


@Column.api
Expand Down Expand Up @@ -44,10 +45,6 @@ class DescribeSheet(ColumnsSheet):
DescribeColumn('nulls', type=vlen),
DescribeColumn('distinct',type=vlen),
DescribeColumn('mode', type=str),
DescribeColumn('min', type=str),
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any functional reason to moving these into describe_aggrs? I think this must be why min/max types were changed to anytype (and maybe they should not be str here in the original code), so maybe this is not necessary.

DescribeColumn('max', type=str),
DescribeColumn('sum'),
DescribeColumn('median', type=str),
]
colorizers = [
RowColorizer(7, 'color_key_col', lambda s,c,r,v: r and r in r.sheet.keyCols),
Expand All @@ -61,7 +58,8 @@ def loader(self):
self.resetCols()

for aggrname in vd.options.describe_aggrs.split():
self.addColumn(DescribeColumn(aggrname, type=float))
aggrtype = vd.aggregators[aggrname].type
self.addColumn(DescribeColumn(aggrname, type=aggrtype))

for srccol in Progress(self.rows, 'categorizing'):
if not srccol.hidden:
Expand All @@ -87,12 +85,15 @@ def reloadColumn(self, srccol):
d['distinct'].add(v)
except Exception as e:
d['errors'].append(sr)
if not vals:
return

d['mode'] = self.calcStatistic(d, mode, vals)
if vd.isNumeric(srccol):
for func in [min, max, sum, median]: # use type
d[func.__name__] = self.calcStatistic(d, func, vals)
if vd.isNumeric(srccol) or \
isinstance(vals[0], (datetime.timedelta, datetime.date)):
for aggrname in vd.options.describe_aggrs.split():
if aggrname == 'sum' and (srccol.type is date or isinstance(vals[0], datetime.date)):
continue
aggr = vd.aggregators[aggrname].funcValues
d[aggrname] = self.calcStatistic(d, aggr, vals)

Expand Down
Loading