From e5982e1b4c72f1d2bded965d1d7db3379ff16790 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Sat, 24 Feb 2024 18:35:04 -0800 Subject: [PATCH 1/9] [aggr-] allow mean and stdev for dates and timedeltas --- visidata/aggregators.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 3b424a5ae..92177567d 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -4,6 +4,7 @@ import collections import statistics from copy import copy +import datetime from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS @@ -105,11 +106,31 @@ def aggregator(vd, name, funcValues, helpstr='', *, type=None): def mean(vals): vals = list(vals) if vals: - return float(sum(vals))/len(vals) + if type(vals[0]) is date: + vals = [d.timestamp() for d in vals] + ans = float(sum(vals))/len(vals) + return datetime.date.fromtimestamp(ans) + elif isinstance(vals[0], datetime.timedelta): + return datetime.timedelta(seconds=vsum(vals)/datetime.timedelta(seconds=len(vals))) + else: + return float(sum(vals))/len(vals) def vsum(vals): return sum(vals, start=type(vals[0] if len(vals) else 0)()) #1996 +def stdev(vals): + if vals and len(vals) >= 2: + if type(vals[0]) is date: + vals = [d.timestamp() for d in vals] + return datetime.timedelta(seconds=statistics.stdev(vals)) + elif isinstance(vals[0], datetime.timedelta): + vals = [d.total_seconds() for d in vals] + return datetime.timedelta(seconds=statistics.stdev(vals)) + return statistics.stdev(vals) + else: + vd.error('stdev requires at least two data points') + return None + # http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/ def _percentile(N, percent, key=lambda x:x): """ @@ -148,15 +169,15 @@ def quantiles(q, helpstr): vd.aggregator('min', min, 'minimum value') vd.aggregator('max', max, 'maximum value') -vd.aggregator('avg', mean, 'arithmetic mean of values', type=float) -vd.aggregator('mean', mean, 'arithmetic mean of values', type=float) +vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype) +vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype) vd.aggregator('median', statistics.median, 'median of values') vd.aggregator('mode', statistics.mode, 'mode of values') vd.aggregator('sum', vsum, 'sum of values') vd.aggregator('distinct', set, 'distinct values', type=vlen) vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int) vd.aggregator('list', list, 'list of values', type=anytype) -vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float) +vd.aggregator('stdev', stdev, 'standard deviation of values', type=anytype) vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)') vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)') @@ -243,7 +264,11 @@ def memo_aggregate(col, agg_choices, rows): for agg in aggs: aggval = agg.aggregate(col, rows) typedval = wrapply(agg.type or col.type, aggval) - dispval = col.format(typedval) + if agg.name == 'stdev' and (col.type is date): + # col type is a date, but typedval is a timedelta + dispval = str(typedval) + else: + dispval = col.format(typedval) k = col.name+'_'+agg.name vd.status(f'{k}={dispval}') vd.memory[k] = typedval From 4ec25f56d2eea7a74217f1b8407e8bb0eab7ec5b Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Sun, 14 Apr 2024 20:09:38 -0700 Subject: [PATCH 2/9] [aggr-] allow median for an even number of dates --- visidata/aggregators.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 92177567d..2f71cfd09 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -118,6 +118,16 @@ def mean(vals): def vsum(vals): return sum(vals, start=type(vals[0] if len(vals) else 0)()) #1996 +def median(vals): + if not vals: + return None + if type(vals[0]) is date: + # when the length is even, statistics.median needs to add + # two midpoints to average them, so convert to timestamps + vals = [d.timestamp() for d in vals] + return datetime.date.fromtimestamp(statistics.median(vals)) + return statistics.median(vals) + def stdev(vals): if vals and len(vals) >= 2: if type(vals[0]) is date: @@ -171,7 +181,7 @@ def quantiles(q, helpstr): vd.aggregator('max', max, 'maximum value') vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype) vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype) -vd.aggregator('median', statistics.median, 'median of values') +vd.aggregator('median', median, 'median of values') vd.aggregator('mode', statistics.mode, 'mode of values') vd.aggregator('sum', vsum, 'sum of values') vd.aggregator('distinct', set, 'distinct values', type=vlen) From 3c84c7ca5e434cb2cae663faa685370c6d20e01c Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Sun, 25 Feb 2024 00:07:09 -0800 Subject: [PATCH 3/9] [aggr-] change sum type to anytype For a column of dates with no rows, the sum was shown as "Dec 31, 1969". --- visidata/aggregators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 2f71cfd09..198c4d2e9 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -183,7 +183,7 @@ def quantiles(q, helpstr): vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype) vd.aggregator('median', median, 'median of values') vd.aggregator('mode', statistics.mode, 'mode of values') -vd.aggregator('sum', vsum, 'sum of values') +vd.aggregator('sum', vsum, 'sum of values', type=anytype) vd.aggregator('distinct', set, 'distinct values', type=vlen) vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int) vd.aggregator('list', list, 'list of values', type=anytype) From 1bab04f7bbab5e0304136936c9830349f8b7ce55 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:12:39 -0700 Subject: [PATCH 4/9] [aggr-] clarify error for sum of dates --- visidata/aggregators.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 198c4d2e9..789b0a4be 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -116,7 +116,13 @@ def mean(vals): return float(sum(vals))/len(vals) def vsum(vals): - return sum(vals, start=type(vals[0] if len(vals) else 0)()) #1996 + if vals: + if type(vals[0]) is date: + vd.error('dates cannot be summed') + return None + return sum(vals, start=type(vals[0])()) #1996 + else: + return 0 def median(vals): if not vals: From fcbba9d6409d51ef6e71b5462f5ccbda75c059c7 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Wed, 10 Apr 2024 22:55:00 -0700 Subject: [PATCH 5/9] [describe-] use each aggregator's type for its column Fixes the display of mean of a column of dates. --- visidata/features/describe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/visidata/features/describe.py b/visidata/features/describe.py index be26311c8..83aaedf9c 100644 --- a/visidata/features/describe.py +++ b/visidata/features/describe.py @@ -1,7 +1,7 @@ from copy import copy from statistics import mode, median, mean, stdev -from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply +from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet @@ -61,7 +61,8 @@ def loader(self): self.resetCols() for aggrname in vd.options.describe_aggrs.split(): - self.addColumn(DescribeColumn(aggrname, type=float)) + aggrtype = vd.aggregators[aggrname].type + self.addColumn(DescribeColumn(aggrname, type=aggrtype)) for srccol in Progress(self.rows, 'categorizing'): if not srccol.hidden: From 9e97c8a4cd08bc83c943272780c17cc85a5547d3 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:30:33 -0700 Subject: [PATCH 6/9] [describe-] use aggregators for min/max/sum/median The sum aggregator can sum timedeltas. The sum() function could not. --- visidata/features/describe.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/visidata/features/describe.py b/visidata/features/describe.py index 83aaedf9c..01bb098e7 100644 --- a/visidata/features/describe.py +++ b/visidata/features/describe.py @@ -1,11 +1,11 @@ from copy import copy -from statistics import mode, median, mean, stdev +from statistics import mode from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet -vd.option('describe_aggrs', 'mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators) +vd.option('describe_aggrs', 'min max sum median mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators) @Column.api @@ -44,10 +44,6 @@ class DescribeSheet(ColumnsSheet): DescribeColumn('nulls', type=vlen), DescribeColumn('distinct',type=vlen), DescribeColumn('mode', type=str), - DescribeColumn('min', type=str), - DescribeColumn('max', type=str), - DescribeColumn('sum'), - DescribeColumn('median', type=str), ] colorizers = [ RowColorizer(7, 'color_key_col', lambda s,c,r,v: r and r in r.sheet.keyCols), @@ -91,8 +87,6 @@ def reloadColumn(self, srccol): d['mode'] = self.calcStatistic(d, mode, vals) if vd.isNumeric(srccol): - for func in [min, max, sum, median]: # use type - d[func.__name__] = self.calcStatistic(d, func, vals) for aggrname in vd.options.describe_aggrs.split(): aggr = vd.aggregators[aggrname].funcValues d[aggrname] = self.calcStatistic(d, aggr, vals) From 0aa07c412428a63417c83694337d7d78acf351a3 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Thu, 11 Apr 2024 20:07:10 -0700 Subject: [PATCH 7/9] [describe-] calculate stats only when type is appropriate --- visidata/features/describe.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/visidata/features/describe.py b/visidata/features/describe.py index 01bb098e7..1181cbf10 100644 --- a/visidata/features/describe.py +++ b/visidata/features/describe.py @@ -1,7 +1,8 @@ from copy import copy from statistics import mode +import datetime -from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype +from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype, date from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet @@ -84,10 +85,15 @@ def reloadColumn(self, srccol): d['distinct'].add(v) except Exception as e: d['errors'].append(sr) + if not vals: + return d['mode'] = self.calcStatistic(d, mode, vals) - if vd.isNumeric(srccol): + if vd.isNumeric(srccol) or \ + isinstance(vals[0], (datetime.timedelta, datetime.date)): for aggrname in vd.options.describe_aggrs.split(): + if aggrname == 'sum' and (srccol.type is date or isinstance(vals[0], datetime.date)): + continue aggr = vd.aggregators[aggrname].funcValues d[aggrname] = self.calcStatistic(d, aggr, vals) From a842f7cd135cf3cd4d4c740d45f2689662dfe8ee Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Thu, 11 Apr 2024 13:29:15 -0700 Subject: [PATCH 8/9] [aggr-] change type for min/max/median to anytype Fixes type-related errors when using the aggregators in DescribeSheet. --- visidata/aggregators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 789b0a4be..f748de624 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -183,11 +183,11 @@ def quantiles(q, helpstr): return [PercentileAggregator(round(100*i/q), helpstr) for i in range(1, q)] -vd.aggregator('min', min, 'minimum value') -vd.aggregator('max', max, 'maximum value') +vd.aggregator('min', min, 'minimum value', type=anytype) +vd.aggregator('max', max, 'maximum value', type=anytype) vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype) vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype) -vd.aggregator('median', median, 'median of values') +vd.aggregator('median', median, 'median of values', type=anytype) vd.aggregator('mode', statistics.mode, 'mode of values') vd.aggregator('sum', vsum, 'sum of values', type=anytype) vd.aggregator('distinct', set, 'distinct values', type=vlen) From ca2fc92755605137d74e6129619476d448948307 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:35:13 -0700 Subject: [PATCH 9/9] [aggr-] fail when memo-aggregate is called on no rows --- visidata/aggregators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index f748de624..73312c9f5 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -273,6 +273,8 @@ def _aggregateTotalAsync(col, agg): @asyncthread def memo_aggregate(col, agg_choices, rows): 'Show aggregated value in status, and add to memory.' + if not rows: + vd.fail('no rows to aggregate') for agg_choice in agg_choices: agg = vd.aggregators.get(agg_choice) if not agg: continue