From e5982e1b4c72f1d2bded965d1d7db3379ff16790 Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Sat, 24 Feb 2024 18:35:04 -0800
Subject: [PATCH 1/9] [aggr-] allow mean and stdev for dates and timedeltas

---
 visidata/aggregators.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/visidata/aggregators.py b/visidata/aggregators.py
index 3b424a5ae..92177567d 100644
--- a/visidata/aggregators.py
+++ b/visidata/aggregators.py
@@ -4,6 +4,7 @@
 import collections
 import statistics
 from copy import copy
+import datetime
 
 from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
 from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS
@@ -105,11 +106,31 @@ def aggregator(vd, name, funcValues, helpstr='', *, type=None):
 def mean(vals):
     vals = list(vals)
     if vals:
-        return float(sum(vals))/len(vals)
+        if type(vals[0]) is date:
+            vals = [d.timestamp() for d in vals]
+            ans = float(sum(vals))/len(vals)
+            return datetime.date.fromtimestamp(ans)
+        elif isinstance(vals[0], datetime.timedelta):
+            return datetime.timedelta(seconds=vsum(vals)/datetime.timedelta(seconds=len(vals)))
+        else:
+            return float(sum(vals))/len(vals)
 
 def vsum(vals):
     return sum(vals, start=type(vals[0] if len(vals) else 0)())  #1996
 
+def stdev(vals):
+    if vals and len(vals) >= 2:
+        if type(vals[0]) is date:
+            vals = [d.timestamp() for d in vals]
+            return datetime.timedelta(seconds=statistics.stdev(vals))
+        elif isinstance(vals[0], datetime.timedelta):
+            vals = [d.total_seconds() for d in vals]
+            return datetime.timedelta(seconds=statistics.stdev(vals))
+        return statistics.stdev(vals)
+    else:
+        vd.error('stdev requires at least two data points')
+        return None
+
 # http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
 def _percentile(N, percent, key=lambda x:x):
     """
@@ -148,15 +169,15 @@ def quantiles(q, helpstr):
 
 vd.aggregator('min', min, 'minimum value')
 vd.aggregator('max', max, 'maximum value')
-vd.aggregator('avg', mean, 'arithmetic mean of values', type=float)
-vd.aggregator('mean', mean, 'arithmetic mean of values', type=float)
+vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype)
+vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype)
 vd.aggregator('median', statistics.median, 'median of values')
 vd.aggregator('mode', statistics.mode, 'mode of values')
 vd.aggregator('sum', vsum, 'sum of values')
 vd.aggregator('distinct', set, 'distinct values', type=vlen)
 vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
 vd.aggregator('list', list, 'list of values', type=anytype)
-vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float)
+vd.aggregator('stdev', stdev, 'standard deviation of values', type=anytype)
 
 vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
 vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)')
@@ -243,7 +264,11 @@ def memo_aggregate(col, agg_choices, rows):
         for agg in aggs:
             aggval = agg.aggregate(col, rows)
             typedval = wrapply(agg.type or col.type, aggval)
-            dispval = col.format(typedval)
+            if agg.name == 'stdev' and (col.type is date):
+                # col type is a date, but typedval is a timedelta
+                dispval = str(typedval)
+            else:
+                dispval = col.format(typedval)
             k = col.name+'_'+agg.name
             vd.status(f'{k}={dispval}')
             vd.memory[k] = typedval

From 4ec25f56d2eea7a74217f1b8407e8bb0eab7ec5b Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Sun, 14 Apr 2024 20:09:38 -0700
Subject: [PATCH 2/9] [aggr-] allow median for an even number of dates

---
 visidata/aggregators.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/visidata/aggregators.py b/visidata/aggregators.py
index 92177567d..2f71cfd09 100644
--- a/visidata/aggregators.py
+++ b/visidata/aggregators.py
@@ -118,6 +118,16 @@ def mean(vals):
 def vsum(vals):
     return sum(vals, start=type(vals[0] if len(vals) else 0)())  #1996
 
+def median(vals):
+    if not vals:
+        return None
+    if type(vals[0]) is date:
+        # when the length is even, statistics.median needs to add
+        # two midpoints to average them, so convert to timestamps
+        vals = [d.timestamp() for d in vals]
+        return datetime.date.fromtimestamp(statistics.median(vals))
+    return statistics.median(vals)
+
 def stdev(vals):
     if vals and len(vals) >= 2:
         if type(vals[0]) is date:
@@ -171,7 +181,7 @@ def quantiles(q, helpstr):
 vd.aggregator('max', max, 'maximum value')
 vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype)
 vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype)
-vd.aggregator('median', statistics.median, 'median of values')
+vd.aggregator('median', median, 'median of values')
 vd.aggregator('mode', statistics.mode, 'mode of values')
 vd.aggregator('sum', vsum, 'sum of values')
 vd.aggregator('distinct', set, 'distinct values', type=vlen)

From 3c84c7ca5e434cb2cae663faa685370c6d20e01c Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Sun, 25 Feb 2024 00:07:09 -0800
Subject: [PATCH 3/9] [aggr-] change sum type to anytype

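For a column of dates with no rows, the sum was shown as "Dec 31, 1969":
the sum aggregator had no type of its own, so the empty sum (0) was
converted using the column's date type, giving the Unix epoch in local time.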
---
 visidata/aggregators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/visidata/aggregators.py b/visidata/aggregators.py
index 2f71cfd09..198c4d2e9 100644
--- a/visidata/aggregators.py
+++ b/visidata/aggregators.py
@@ -183,7 +183,7 @@ def quantiles(q, helpstr):
 vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype)
 vd.aggregator('median', median, 'median of values')
 vd.aggregator('mode', statistics.mode, 'mode of values')
-vd.aggregator('sum', vsum, 'sum of values')
+vd.aggregator('sum', vsum, 'sum of values', type=anytype)
 vd.aggregator('distinct', set, 'distinct values', type=vlen)
 vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
 vd.aggregator('list', list, 'list of values', type=anytype)

From 1bab04f7bbab5e0304136936c9830349f8b7ce55 Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Wed, 10 Apr 2024 23:12:39 -0700
Subject: [PATCH 4/9] [aggr-] clarify error for sum of dates

---
 visidata/aggregators.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/visidata/aggregators.py b/visidata/aggregators.py
index 198c4d2e9..789b0a4be 100644
--- a/visidata/aggregators.py
+++ b/visidata/aggregators.py
@@ -116,7 +116,13 @@ def mean(vals):
             return float(sum(vals))/len(vals)
 
 def vsum(vals):
-    return sum(vals, start=type(vals[0] if len(vals) else 0)())  #1996
+    if vals:
+        if type(vals[0]) is date:
+            vd.error('dates cannot be summed')
+            return None
+        return sum(vals, start=type(vals[0])())  #1996
+    else:
+        return 0
 
 def median(vals):
     if not vals:

From fcbba9d6409d51ef6e71b5462f5ccbda75c059c7 Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Wed, 10 Apr 2024 22:55:00 -0700
Subject: [PATCH 5/9] [describe-] use each aggregator's type for its column

Fixes the display of the mean of a column of dates. Previously every
aggregator column on the Describe sheet was typed as float.
---
 visidata/features/describe.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/visidata/features/describe.py b/visidata/features/describe.py
index be26311c8..83aaedf9c 100644
--- a/visidata/features/describe.py
+++ b/visidata/features/describe.py
@@ -1,7 +1,7 @@
 from copy import copy
 from statistics import mode, median, mean, stdev
 
-from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply
+from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype
 from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet
 
 
@@ -61,7 +61,8 @@ def loader(self):
         self.resetCols()
 
         for aggrname in vd.options.describe_aggrs.split():
-            self.addColumn(DescribeColumn(aggrname, type=float))
+            aggrtype = vd.aggregators[aggrname].type
+            self.addColumn(DescribeColumn(aggrname, type=aggrtype))
 
         for srccol in Progress(self.rows, 'categorizing'):
             if not srccol.hidden:

From 9e97c8a4cd08bc83c943272780c17cc85a5547d3 Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Wed, 10 Apr 2024 23:30:33 -0700
Subject: [PATCH 6/9] [describe-] use aggregators for min/max/sum/median

The sum aggregator can sum timedeltas. The built-in sum() function could
not, because its default start value of 0 cannot be added to a timedelta.
---
 visidata/features/describe.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/visidata/features/describe.py b/visidata/features/describe.py
index 83aaedf9c..01bb098e7 100644
--- a/visidata/features/describe.py
+++ b/visidata/features/describe.py
@@ -1,11 +1,11 @@
 from copy import copy
-from statistics import mode, median, mean, stdev
+from statistics import mode
 
 from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype
 from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet
 
 
-vd.option('describe_aggrs', 'mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators)
+vd.option('describe_aggrs', 'min max sum median mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators)
 
 
 @Column.api
@@ -44,10 +44,6 @@ class DescribeSheet(ColumnsSheet):
             DescribeColumn('nulls',  type=vlen),
             DescribeColumn('distinct',type=vlen),
             DescribeColumn('mode',   type=str),
-            DescribeColumn('min',    type=str),
-            DescribeColumn('max',    type=str),
-            DescribeColumn('sum'),
-            DescribeColumn('median', type=str),
     ]
     colorizers = [
         RowColorizer(7, 'color_key_col', lambda s,c,r,v: r and r in r.sheet.keyCols),
@@ -91,8 +87,6 @@ def reloadColumn(self, srccol):
 
             d['mode'] = self.calcStatistic(d, mode, vals)
             if vd.isNumeric(srccol):
-                for func in [min, max, sum, median]:  # use type
-                    d[func.__name__] = self.calcStatistic(d, func, vals)
                 for aggrname in vd.options.describe_aggrs.split():
                     aggr = vd.aggregators[aggrname].funcValues
                     d[aggrname] = self.calcStatistic(d, aggr, vals)

From 0aa07c412428a63417c83694337d7d78acf351a3 Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Thu, 11 Apr 2024 20:07:10 -0700
Subject: [PATCH 7/9] [describe-] calculate stats only when type is appropriate

---
 visidata/features/describe.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/visidata/features/describe.py b/visidata/features/describe.py
index 01bb098e7..1181cbf10 100644
--- a/visidata/features/describe.py
+++ b/visidata/features/describe.py
@@ -1,7 +1,8 @@
 from copy import copy
 from statistics import mode
+import datetime
 
-from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype
+from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype, date
 from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet
 
 
@@ -84,10 +85,15 @@ def reloadColumn(self, srccol):
                     d['distinct'].add(v)
                 except Exception as e:
                     d['errors'].append(sr)
+            if not vals:
+                return
 
             d['mode'] = self.calcStatistic(d, mode, vals)
-            if vd.isNumeric(srccol):
+            if vd.isNumeric(srccol) or \
+               isinstance(vals[0], (datetime.timedelta, datetime.date)):
                 for aggrname in vd.options.describe_aggrs.split():
+                    if aggrname == 'sum' and (srccol.type is date or isinstance(vals[0], datetime.date)):
+                        continue
                     aggr = vd.aggregators[aggrname].funcValues
                     d[aggrname] = self.calcStatistic(d, aggr, vals)
 

From a842f7cd135cf3cd4d4c740d45f2689662dfe8ee Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Thu, 11 Apr 2024 13:29:15 -0700
Subject: [PATCH 8/9] [aggr-] change type for min/max/median to anytype

Fixes type-related errors when using the aggregators in DescribeSheet.
---
 visidata/aggregators.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/visidata/aggregators.py b/visidata/aggregators.py
index 789b0a4be..f748de624 100644
--- a/visidata/aggregators.py
+++ b/visidata/aggregators.py
@@ -183,11 +183,11 @@ def quantiles(q, helpstr):
     return [PercentileAggregator(round(100*i/q), helpstr) for i in range(1, q)]
 
 
-vd.aggregator('min', min, 'minimum value')
-vd.aggregator('max', max, 'maximum value')
+vd.aggregator('min', min, 'minimum value', type=anytype)
+vd.aggregator('max', max, 'maximum value', type=anytype)
 vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype)
 vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype)
-vd.aggregator('median', median, 'median of values')
+vd.aggregator('median', median, 'median of values', type=anytype)
 vd.aggregator('mode', statistics.mode, 'mode of values')
 vd.aggregator('sum', vsum, 'sum of values', type=anytype)
 vd.aggregator('distinct', set, 'distinct values', type=vlen)

From ca2fc92755605137d74e6129619476d448948307 Mon Sep 17 00:00:00 2001
From: midichef <67946319+midichef@users.noreply.github.com>
Date: Thu, 11 Apr 2024 23:35:13 -0700
Subject: [PATCH 9/9] [aggr-] fail when memo-aggregate is called on no rows

---
 visidata/aggregators.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/visidata/aggregators.py b/visidata/aggregators.py
index f748de624..73312c9f5 100644
--- a/visidata/aggregators.py
+++ b/visidata/aggregators.py
@@ -273,6 +273,8 @@ def _aggregateTotalAsync(col, agg):
 @asyncthread
 def memo_aggregate(col, agg_choices, rows):
     'Show aggregated value in status, and add to memory.'
+    if not rows:
+        vd.fail('no rows to aggregate')
     for agg_choice in agg_choices:
         agg = vd.aggregators.get(agg_choice)
         if not agg: continue