diff --git a/lib/polars/data_frame.rb b/lib/polars/data_frame.rb
index 9866a420f0..42252b3a4b 100644
--- a/lib/polars/data_frame.rb
+++ b/lib/polars/data_frame.rb
@@ -1791,13 +1791,13 @@ def with_row_count(name: "row_nr", offset: 0)
 _from_rbdf(_df.with_row_count(name, offset))
 end
 
- # Start a groupby operation.
+ # Start a group by operation.
 #
 # @param by [Object]
 # Column(s) to group by.
 # @param maintain_order [Boolean]
 # Make sure that the order of the groups remain consistent. This is more
- # expensive than a default groupby. Note that this only works in expression
+ # expensive than a default group by. Note that this only works in expression
 # aggregations.
 #
 # @return [GroupBy]
@@ -1810,7 +1810,7 @@ def with_row_count(name: "row_nr", offset: 0)
 # "c" => [6, 5, 4, 3, 2, 1]
 # }
 # )
- # df.groupby("a").agg(Polars.col("b").sum).sort("a")
+ # df.group_by("a").agg(Polars.col("b").sum).sort("a")
 # # =>
 # # shape: (3, 2)
 # # ┌─────┬─────┐
@@ -1824,7 +1824,7 @@ def with_row_count(name: "row_nr", offset: 0)
 # # └─────┴─────┘
 def group_by(by, maintain_order: false)
 if !Utils.bool?(maintain_order)
- raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
+ raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
 end
 GroupBy.new(
 self,
@@ -1839,9 +1839,9 @@ def group_by(by, maintain_order: false)
 #
 # Also works for index values of type `:i32` or `:i64`.
 #
- # Different from a `dynamic_groupby` the windows are now determined by the
+ # Different from a `dynamic_group_by`, the windows are now determined by the
 # individual values and are not of constant intervals. For constant intervals use
- # *groupby_dynamic*
+ # *group_by_dynamic*.
 #
 # The `period` and `offset` arguments are created either from a timedelta, or
 # by using the following string language:
@@ -1861,7 +1861,7 @@ def group_by(by, maintain_order: false)
 # Or combine them:
 # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
 #
- # In case of a groupby_rolling on an integer column, the windows are defined by:
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
 #
 # - **"1i" # length 1**
 # - **"10i" # length 10**
@@ -1872,7 +1872,7 @@ def group_by(by, maintain_order: false)
 # This column must be sorted in ascending order. If not the output will not
 # make sense.
 #
- # In case of a rolling groupby on indices, dtype needs to be one of
+ # In case of a rolling group by on indices, dtype needs to be one of
 # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
 # performance matters use an `:i64` column.
 # @param period [Object]
@@ -1904,7 +1904,7 @@ def group_by(by, maintain_order: false)
 # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
 # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
 # )
- # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ # df.group_by_rolling(index_column: "dt", period: "2d").agg(
 # [
 # Polars.sum("a").alias("sum_a"),
 # Polars.min("a").alias("min_a"),
@@ -1940,7 +1940,7 @@ def group_by_rolling(
 # Group based on a time value (or index value of type `:i32`, `:i64`).
 #
 # Time windows are calculated and rows are assigned to windows. Different from a
- # normal groupby is that a row can be member of multiple groups. The time/index
+ # normal group by, a row can be a member of multiple groups. The time/index
 # window could be seen as a rolling window, with a window size determined by
 # dates/times/values instead of slots in the DataFrame.
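To make the "member of multiple groups" point above concrete: with `every` shorter than `period`, consecutive windows overlap, which is exactly how one row ends up in more than one group. A minimal eager sketch against the renamed API (the frame and column name are illustrative, not taken from this diff):

  df = Polars::DataFrame.new({"idx" => Polars.arange(0, 6, eager: true)})
  # Windows start every 2 index values but span 3, so neighboring windows
  # share a row and that row is aggregated into both groups.
  df.group_by_dynamic("idx", every: "2i", period: "3i").agg(
    Polars.col("idx").count.alias("n_rows")
  )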
 #
@@ -1968,7 +1968,7 @@ def group_by_rolling(
 # Or combine them:
 # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
 #
- # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
 #
 # - "1i" # length 1
 # - "10i" # length 10
@@ -1979,7 +1979,7 @@ def group_by_rolling(
 # This column must be sorted in ascending order. If not the output will not
 # make sense.
 #
- # In case of a dynamic groupby on indices, dtype needs to be one of
+ # In case of a dynamic group by on indices, dtype needs to be one of
 # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
 # performance matters use an `:i64` column.
 # @param every
@@ -2030,7 +2030,7 @@ def group_by_rolling(
 # # └─────────────────────┴─────┘
 #
 # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
- # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
 # [
 # Polars.col("time").min.alias("time_min"),
 # Polars.col("time").max.alias("time_max")
@@ -2050,7 +2050,7 @@ def group_by_rolling(
 # # └─────────────────────┴─────────────────────┴─────────────────────┘
 #
 # @example The window boundaries can also be added to the aggregation result.
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
 # "time", every: "1h", include_boundaries: true, closed: "right"
 # ).agg([Polars.col("time").count.alias("time_count")])
 # # =>
@@ -2067,7 +2067,7 @@ def group_by_rolling(
 # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
 #
 # @example When closed="left", should not include right end of interval.
- # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
 # [
 # Polars.col("time").count.alias("time_count"),
 # Polars.col("time").alias("time_agg_list")
@@ -2087,7 +2087,7 @@ def group_by_rolling(
 # # └─────────────────────┴────────────┴───────────────────────────────────┘
 #
 # @example When closed="both" the time values at the window boundaries belong to 2 groups.
- # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
 # [Polars.col("time").count.alias("time_count")]
 # )
 # # =>
@@ -2104,7 +2104,7 @@ def group_by_rolling(
 # # │ 2021-12-16 03:00:00 ┆ 1 │
 # # └─────────────────────┴────────────┘
 #
- # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ # @example Dynamic group by operations can also be combined with grouping on normal keys.
 # df = Polars::DataFrame.new(
 # {
 # "time" => Polars.date_range(
@@ -2115,7 +2115,7 @@ def group_by_rolling(
 # DateTime.new(2021, 12, 16),
 # DateTime.new(2021, 12, 16, 3),
 # "30m"
 # ),
 # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
 # }
 # )
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
 # "time",
 # every: "1h",
 # closed: "both",
@@ -2138,14 +2138,14 @@ def group_by_rolling(
 # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
 # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
 #
- # @example Dynamic groupby on an index column.
+ # @example Dynamic group by on an index column.
 # df = Polars::DataFrame.new(
 # {
 # "idx" => Polars.arange(0, 6, eager: true),
 # "A" => ["A", "A", "B", "B", "B", "C"]
 # }
 # )
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
 # "idx",
 # every: "2i",
 # period: "3i",
diff --git a/lib/polars/dynamic_group_by.rb b/lib/polars/dynamic_group_by.rb
index 081c7cc89b..cbe46af522 100644
--- a/lib/polars/dynamic_group_by.rb
+++ b/lib/polars/dynamic_group_by.rb
@@ -2,7 +2,7 @@ module Polars
 # A dynamic grouper.
 #
 # This has an `.agg` method which allows you to run all polars expressions in a
- # groupby context.
+ # group by context.
 class DynamicGroupBy
 def initialize(
 df,
diff --git a/lib/polars/expr.rb b/lib/polars/expr.rb
index 0622bc18ba..5e68cfe100 100644
--- a/lib/polars/expr.rb
+++ b/lib/polars/expr.rb
@@ -689,7 +689,7 @@ def is_not_nan
 # "value" => [94, 95, 96, 97, 97, 99]
 # }
 # )
- # df.groupby("group", maintain_order: true).agg(Polars.col("value").agg_groups)
+ # df.group_by("group", maintain_order: true).agg(Polars.col("value").agg_groups)
 # # =>
 # # shape: (2, 2)
 # # ┌───────┬───────────┐
@@ -1236,7 +1236,7 @@ def cast(dtype, strict: true)
 # Sort this column. In projection/ selection context the whole column is sorted.
 #
- # If used in a groupby context, the groups are sorted.
+ # If used in a group by context, the groups are sorted.
 #
 # @param reverse [Boolean]
 # false -> order from small to large.
@@ -1294,7 +1294,7 @@ def cast(dtype, strict: true)
 # # └───────┘
 #
 # @example
- # df.groupby("group").agg(Polars.col("value").sort)
+ # df.group_by("group").agg(Polars.col("value").sort)
 # # =>
 # # shape: (2, 2)
 # # ┌───────┬────────────┐
@@ -1503,7 +1503,7 @@ def search_sorted(element, side: "any")
 # Sort this column by the ordering of another column, or multiple other columns.
 #
 # In projection/ selection context the whole column is sorted.
- # If used in a groupby context, the groups are sorted.
+ # If used in a group by context, the groups are sorted.
 #
 # @param by [Object]
 # The column(s) used for sorting.
@@ -2210,7 +2210,7 @@ def last
 # Apply window function over a subgroup.
 #
- # This is similar to a groupby + aggregation + self join.
+ # This is similar to a group by + aggregation + self join.
 # Or similar to [window functions in Postgres](https://www.postgresql.org/docs/current/tutorial-window.html).
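Because `over` is described above as a group by + aggregation + self join, the equivalence fits in two lines; a small sketch with invented data, using only calls that appear elsewhere in this diff:

  df = Polars::DataFrame.new({"groups" => ["a", "a", "b"], "values" => [1, 2, 3]})
  # Every row receives its own group's aggregate, like a SQL window function,
  # instead of the frame collapsing to one row per group.
  df.with_column(Polars.col("values").sum.over("groups").alias("group_sum"))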
 #
 # @param expr [Object]
@@ -2485,7 +2485,7 @@ def quantile(quantile, interpolation: "nearest")
 # }
 # )
 # (
- # df.groupby("group_col").agg(
+ # df.group_by("group_col").agg(
 # [
 # Polars.col("b").filter(Polars.col("b") < 2).sum.alias("lt"),
 # Polars.col("b").filter(Polars.col("b") >= 2).sum.alias("gte")
@@ -2523,7 +2523,7 @@ def filter(predicate)
 # }
 # )
 # (
- # df.groupby("group_col").agg(
+ # df.group_by("group_col").agg(
 # [
 # Polars.col("b").where(Polars.col("b") < 2).sum.alias("lt"),
 # Polars.col("b").where(Polars.col("b") >= 2).sum.alias("gte")
@@ -2641,7 +2641,7 @@ def where(predicate)
 #
 # @example In a GroupBy context the function is applied by group:
 # df.lazy
- # .groupby("b", maintain_order: true)
+ # .group_by("b", maintain_order: true)
 # .agg(
 # [
 # Polars.col("a").apply { |x| x.sum }
@@ -2680,7 +2680,7 @@ def where(predicate)
 # "values" => [[1, 2], [2, 3], [4]]
 # }
 # )
- # df.groupby("group").agg(Polars.col("values").flatten)
+ # df.group_by("group").agg(Polars.col("values").flatten)
 # # =>
 # # shape: (2, 2)
 # # ┌───────┬───────────┐
@@ -3170,7 +3170,7 @@ def interpolate(method: "linear")
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -3259,7 +3259,7 @@ def rolling_min(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -3348,7 +3348,7 @@ def rolling_max(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -3437,7 +3437,7 @@ def rolling_mean(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -3526,7 +3526,7 @@ def rolling_sum(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -3616,7 +3616,7 @@ def rolling_std(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -3702,7 +3702,7 @@ def rolling_var(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
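The @note repeated on each rolling_* method suggests switching to group_by_rolling when several statistics share one window, since the window computation is then done once. A hedged sketch of that pattern (integer index data invented for illustration, and `Polars.mean` assumed to mirror the `Polars.min`/`Polars.max` helpers shown above; the docs require the index column to be sorted ascending):

  df = Polars::DataFrame.new({"idx" => [1, 2, 3, 4, 5], "a" => [3, 7, 5, 9, 2]})
  # One "3i" window definition feeds all three aggregations in a single pass.
  df.group_by_rolling(index_column: "idx", period: "3i").agg(
    [
      Polars.min("a").alias("roll_min"),
      Polars.max("a").alias("roll_max"),
      Polars.mean("a").alias("roll_mean")
    ]
  )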
 #
 # @return [Expr]
@@ -3791,7 +3791,7 @@ def rolling_median(
 #
 # @note
 # If you want to compute multiple aggregation statistics over the same dynamic
- # window, consider using `groupby_rolling` this method can cache the window size
+ # window, consider using `group_by_rolling`; this method can cache the window size
 # computation.
 #
 # @return [Expr]
@@ -4949,7 +4949,7 @@ def entropy(base: 2, normalize: true)
 # Number of valid values there should be in the window before the expression
 # is evaluated. valid values = `length - null_count`
 # @param parallel [Boolean]
- # Run in parallel. Don't do this in a groupby or another operation that
+ # Run in parallel. Don't do this in a group by or another operation that
 # already has much parallelization.
 #
 # @return [Expr]
diff --git a/lib/polars/group_by.rb b/lib/polars/group_by.rb
index c3d49e27e8..2369672be5 100644
--- a/lib/polars/group_by.rb
+++ b/lib/polars/group_by.rb
@@ -8,13 +8,13 @@ def initialize(df, by, maintain_order: false)
 @maintain_order = maintain_order
 end
 
- # Allows iteration over the groups of the groupby operation.
+ # Allows iteration over the groups of the group by operation.
 #
 # @return [Object]
 #
 # @example
 # df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
- # df.groupby("foo", maintain_order: true).each.to_h
+ # df.group_by("foo", maintain_order: true).each.to_h
 # # =>
 # # {"a"=>shape: (2, 2)
 # # ┌─────┬─────┐
@@ -89,7 +89,7 @@ def each
 # "shape" => ["square", "triangle", "square", "triangle", "square"]
 # }
 # )
- # df.groupby("color").apply { |group_df| group_df.sample(2) }
+ # df.group_by("color").apply { |group_df| group_df.sample(2) }
 # # =>
 # # shape: (4, 3)
 # # ┌─────┬───────┬──────────┐
@@ -103,7 +103,7 @@ def each
 # # │ 3 ┆ red ┆ triangle │
 # # └─────┴───────┴──────────┘
 # def apply(&f)
- # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
+ # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
 # end
 
 # Use multiple aggregations on columns.
@@ -119,7 +119,7 @@ def each
 # df = Polars::DataFrame.new(
 # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
 # )
- # df.groupby("foo", maintain_order: true).agg(
+ # df.group_by("foo", maintain_order: true).agg(
 # [
 # Polars.sum("bar").suffix("_sum"),
 # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
@@ -172,7 +172,7 @@ def agg(aggs)
 # # └─────────┴─────┘
 #
 # @example
- # df.groupby("letters").head(2).sort("letters")
+ # df.group_by("letters").head(2).sort("letters")
 # # =>
 # # shape: (5, 2)
 # # ┌─────────┬─────┐
@@ -223,7 +223,7 @@ def head(n = 5)
 # # └─────────┴─────┘
 #
 # @example
- # df.groupby("letters").tail(2).sort("letters")
+ # df.group_by("letters").tail(2).sort("letters")
 # # =>
 # # shape: (5, 2)
 # # ┌─────────┬─────┐
@@ -257,7 +257,7 @@ def tail(n = 5)
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).first
+ # df.group_by("d", maintain_order: true).first
 # # =>
 # # shape: (3, 4)
 # # ┌────────┬─────┬──────┬───────┐
@@ -286,7 +286,7 @@ def first
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).last
+ # df.group_by("d", maintain_order: true).last
 # # =>
 # # shape: (3, 4)
 # # ┌────────┬─────┬──────┬──────┐
@@ -315,7 +315,7 @@ def last
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).sum
+ # df.group_by("d", maintain_order: true).sum
 # # =>
 # # shape: (3, 4)
 # # ┌────────┬─────┬──────┬─────┐
@@ -344,7 +344,7 @@ def sum
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
 # }
 # )
- # df.groupby("d", maintain_order: true).min
+ # df.group_by("d", maintain_order: true).min
 # # =>
 # # shape: (3, 4)
 # # ┌────────┬─────┬──────┬───────┐
@@ -373,7 +373,7 @@ def min
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).max
+ # df.group_by("d", maintain_order: true).max
 # # =>
 # # shape: (3, 4)
 # # ┌────────┬─────┬──────┬──────┐
@@ -402,7 +402,7 @@ def max
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).count
+ # df.group_by("d", maintain_order: true).count
 # # =>
 # # shape: (3, 2)
 # # ┌────────┬───────┐
@@ -431,7 +431,7 @@ def count
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).mean
+ # df.group_by("d", maintain_order: true).mean
 # # =>
 # # shape: (3, 4)
 # # ┌────────┬─────┬──────────┬──────────┐
@@ -459,7 +459,7 @@ def mean
 # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).n_unique
+ # df.group_by("d", maintain_order: true).n_unique
 # # =>
 # # shape: (2, 3)
 # # ┌────────┬─────┬─────┐
@@ -491,7 +491,7 @@ def n_unique
 # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).quantile(1)
+ # df.group_by("d", maintain_order: true).quantile(1)
 # # =>
 # # shape: (3, 3)
 # # ┌────────┬─────┬──────┐
@@ -519,7 +519,7 @@ def quantile(quantile, interpolation: "nearest")
 # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
 # }
 # )
- # df.groupby("d", maintain_order: true).median
+ # df.group_by("d", maintain_order: true).median
 # # =>
 # # shape: (2, 3)
 # # ┌────────┬─────┬──────┐
diff --git a/lib/polars/lazy_frame.rb b/lib/polars/lazy_frame.rb
index 22e4d52f3e..bea9b5cef3 100644
--- a/lib/polars/lazy_frame.rb
+++ b/lib/polars/lazy_frame.rb
@@ -447,7 +447,7 @@ def sort(by, reverse: false, nulls_last: false, maintain_order: false)
 # "c" => [6, 5, 4, 3, 2, 1]
 # }
 # ).lazy
- # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
 # # =>
 # # shape: (3, 3)
 # # ┌─────┬─────┬─────┐
@@ -625,7 +625,7 @@ def sink_parquet(
 # "c" => [6, 5, 4, 3, 2, 1]
 # }
 # ).lazy
- # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
 # # =>
 # # shape: (2, 3)
 # # ┌─────┬─────┬─────┐
@@ -855,13 +855,13 @@ def select(exprs)
 _from_rbldf(_ldf.select(exprs))
 end
 
- # Start a groupby operation.
+ # Start a group by operation.
 #
 # @param by [Object]
 # Column(s) to group by.
 # @param maintain_order [Boolean]
 # Make sure that the order of the groups remain consistent. This is more
- # expensive than a default groupby.
+ # expensive than a default group by.
 #
 # @return [LazyGroupBy]
 #
@@ -873,7 +873,7 @@ def select(exprs)
 # "c" => [6, 5, 4, 3, 2, 1]
 # }
 # ).lazy
- # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
+ # df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
 # # =>
 # # shape: (3, 2)
 # # ┌─────┬─────┐
@@ -897,9 +897,9 @@ def group_by(by, maintain_order: false)
 #
 # Also works for index values of type `:i32` or `:i64`.
 #
- # Different from a `dynamic_groupby` the windows are now determined by the
+ # Different from a `dynamic_group_by`, the windows are now determined by the
 # individual values and are not of constant intervals. For constant intervals
- # use *groupby_dynamic*.
+ # use *group_by_dynamic*.
 #
 # The `period` and `offset` arguments are created either from a timedelta, or
 # by using the following string language:
@@ -919,7 +919,7 @@ def group_by(by, maintain_order: false)
 # Or combine them:
 # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
 #
- # In case of a groupby_rolling on an integer column, the windows are defined by:
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
 #
 # - "1i" # length 1
 # - "10i" # length 10
@@ -930,7 +930,7 @@ def group_by(by, maintain_order: false)
 # This column must be sorted in ascending order. If not the output will not
 # make sense.
 #
- # In case of a rolling groupby on indices, dtype needs to be one of
+ # In case of a rolling group by on indices, dtype needs to be one of
 # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
 # performance matters use an `:i64` column.
 # @param period [Object]
@@ -962,7 +962,7 @@ def group_by(by, maintain_order: false)
 # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
 # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
 # )
- # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ # df.group_by_rolling(index_column: "dt", period: "2d").agg(
 # [
 # Polars.sum("a").alias("sum_a"),
 # Polars.min("a").alias("min_a"),
@@ -1010,7 +1010,7 @@ def group_by_rolling(
 # Group based on a time value (or index value of type `:i32`, `:i64`).
 #
 # Time windows are calculated and rows are assigned to windows. Different from a
- # normal groupby is that a row can be member of multiple groups. The time/index
+ # normal group by, a row can be a member of multiple groups. The time/index
 # window could be seen as a rolling window, with a window size determined by
 # dates/times/values instead of slots in the DataFrame.
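The lazy API gets the same spelling; a minimal end-to-end sketch mirroring the eager examples above (the data is illustrative, every call appears verbatim elsewhere in this diff):

  Polars::DataFrame.new({"a" => ["x", "y", "x"], "b" => [1, 2, 3]})
    .lazy
    .group_by("a", maintain_order: true)
    .agg(Polars.col("b").sum)
    .collect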
 #
@@ -1038,7 +1038,7 @@ def group_by_rolling(
 # Or combine them:
 # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
 #
- # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
 #
 # - "1i" # length 1
 # - "10i" # length 10
@@ -1049,7 +1049,7 @@ def group_by_rolling(
 # This column must be sorted in ascending order. If not the output will not
 # make sense.
 #
- # In case of a dynamic groupby on indices, dtype needs to be one of
+ # In case of a dynamic group by on indices, dtype needs to be one of
 # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
 # performance matters use an `:i64` column.
 # @param every [Object]
@@ -1106,7 +1106,7 @@ def group_by_rolling(
 # # └─────────────────────┴─────┘
 #
 # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
- # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
 # [
 # Polars.col("time").min.alias("time_min"),
 # Polars.col("time").max.alias("time_max")
@@ -1126,7 +1126,7 @@ def group_by_rolling(
 # # └─────────────────────┴─────────────────────┴─────────────────────┘
 #
 # @example The window boundaries can also be added to the aggregation result.
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
 # "time", every: "1h", include_boundaries: true, closed: "right"
 # ).agg([Polars.col("time").count.alias("time_count")])
 # # =>
@@ -1143,7 +1143,7 @@ def group_by_rolling(
 # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
 #
 # @example When closed="left", should not include right end of interval.
- # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
 # [
 # Polars.col("time").count.alias("time_count"),
 # Polars.col("time").alias("time_agg_list")
@@ -1163,7 +1163,7 @@ def group_by_rolling(
 # # └─────────────────────┴────────────┴───────────────────────────────────┘
 #
 # @example When closed="both" the time values at the window boundaries belong to 2 groups.
- # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
 # [Polars.col("time").count.alias("time_count")]
 # )
 # # =>
@@ -1180,7 +1180,7 @@ def group_by_rolling(
 # # │ 2021-12-16 03:00:00 ┆ 1 │
 # # └─────────────────────┴────────────┘
 #
- # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ # @example Dynamic group by operations can also be combined with grouping on normal keys.
 # df = Polars::DataFrame.new(
 # {
 # "time" => Polars.date_range(
@@ -1191,7 +1191,7 @@ def group_by_rolling(
 # DateTime.new(2021, 12, 16),
 # DateTime.new(2021, 12, 16, 3),
 # "30m"
 # ),
 # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
 # }
 # )
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
 # "time",
 # every: "1h",
 # closed: "both",
@@ -1214,14 +1214,14 @@ def group_by_rolling(
 # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
 # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
 #
- # @example Dynamic groupby on an index column.
+ # @example Dynamic group by on an index column.
 # df = Polars::DataFrame.new(
 # {
 # "idx" => Polars.arange(0, 6, eager: true),
 # "A" => ["A", "A", "B", "B", "B", "C"]
 # }
 # )
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
 # "idx",
 # every: "2i",
 # period: "3i",
diff --git a/lib/polars/lazy_group_by.rb b/lib/polars/lazy_group_by.rb
index bbac1ef133..9c7b7ea1fa 100644
--- a/lib/polars/lazy_group_by.rb
+++ b/lib/polars/lazy_group_by.rb
@@ -1,5 +1,5 @@ module Polars
- # Created by `df.lazy.groupby("foo")`.
+ # Created by `df.lazy.group_by("foo")`.
 class LazyGroupBy
 # @private
 def initialize(lgb)
@@ -28,7 +28,7 @@ def agg(aggs)
 # "nrs" => [1, 2, 3, 4, 5, 6]
 # }
 # )
- # df.groupby("letters").head(2).sort("letters")
+ # df.group_by("letters").head(2).sort("letters")
 # # =>
 # # shape: (5, 2)
 # # ┌─────────┬─────┐
@@ -60,7 +60,7 @@ def head(n = 5)
 # "nrs" => [1, 2, 3, 4, 5, 6]
 # }
 # )
- # df.groupby("letters").tail(2).sort("letters")
+ # df.group_by("letters").tail(2).sort("letters")
 # # =>
 # # shape: (5, 2)
 # # ┌─────────┬─────┐
diff --git a/lib/polars/list_expr.rb b/lib/polars/list_expr.rb
index 0fa8b1edb9..2e1d392ecd 100644
--- a/lib/polars/list_expr.rb
+++ b/lib/polars/list_expr.rb
@@ -613,7 +613,7 @@ def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
 # Run all expression parallel. Don't activate this blindly.
 # Parallelism is worth it if there is enough work to do per thread.
 #
- # This likely should not be use in the groupby context, because we already
+ # This likely should not be used in the group by context, because there is already
 # parallel execution per group
 #
 # @return [Expr]
diff --git a/lib/polars/list_name_space.rb b/lib/polars/list_name_space.rb
index f567d9805e..7e1945f242 100644
--- a/lib/polars/list_name_space.rb
+++ b/lib/polars/list_name_space.rb
@@ -315,7 +315,7 @@ def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
 # Run all expression parallel. Don't activate this blindly.
 # Parallelism is worth it if there is enough work to do per thread.
 #
- # This likely should not be use in the groupby context, because we already
+ # This likely should not be used in the group by context, because there is already
 # parallel execution per group
 #
 # @return [Series]
diff --git a/lib/polars/rolling_group_by.rb b/lib/polars/rolling_group_by.rb
index 761cb171fd..5360fae578 100644
--- a/lib/polars/rolling_group_by.rb
+++ b/lib/polars/rolling_group_by.rb
@@ -2,7 +2,7 @@ module Polars
 # A rolling grouper.
 #
 # This has an `.agg` method which will allow you to run all polars expressions in a
- # groupby context.
+ # group by context.
 class RollingGroupBy
 def initialize(
 df,
diff --git a/lib/polars/series.rb b/lib/polars/series.rb
index ba9e720cc4..ed6691afbb 100644
--- a/lib/polars/series.rb
+++ b/lib/polars/series.rb
@@ -811,7 +811,7 @@ def entropy(base: Math::E, normalize: false)
 # Number of valid values there should be in the window before the expression
 # is evaluated. valid values = `length - null_count`
 # @param parallel [Boolean]
- # Run in parallel. Don't do this in a groupby or another operation that
+ # Run in parallel. Don't do this in a group by or another operation that
 # already has much parallelization.
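For context, the `parallel` flag documented here belongs to `cumulative_eval` on `Series`. A usage sketch with the flag left at its default; the expression mirrors the upstream polars docs, and `Polars.element` is assumed to be available in the Ruby API, which this diff does not itself show:

  s = Polars::Series.new("values", [1, 2, 3, 4, 5])
  # Evaluates the expression over an expanding window; sequential by default,
  # which is what you want inside a group by that is already parallelized.
  s.cumulative_eval(Polars.element.first - Polars.element.last ** 2)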
 #
 # @return [Series]
diff --git a/test/data_frame_test.rb b/test/data_frame_test.rb
index 2a15c46680..f5d4bce96e 100644
--- a/test/data_frame_test.rb
+++ b/test/data_frame_test.rb
@@ -366,12 +366,14 @@ def test_tail
 assert_series [18, 19, 20], df.tail(3)["a"]
 end
 
- def test_groupby
+ def test_group_by
 df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["one", "two", "three"]})
+ df.group_by("a").count
+ df.group_by(:a).count
+ df.group_by(["a", "b"]).count
+ df.group_by([:a, :b]).count
 df.groupby("a").count
- df.groupby(:a).count
- df.groupby(["a", "b"]).count
- df.groupby([:a, :b]).count
+ df.group("a").count
 end
 
 def test_join
diff --git a/test/guide_test.rb b/test/guide_test.rb
index 0b5bfc7dba..4882c8f857 100644
--- a/test/guide_test.rb
+++ b/test/guide_test.rb
@@ -26,13 +26,13 @@ def test_readme
 def test_quickstart
 Polars.read_csv("test/support/iris.csv")
 .filter(Polars.col("sepal_length") > 5)
- .groupby("species")
+ .group_by("species")
 .agg(Polars.all.sum)
 
 Polars.read_csv("test/support/iris.csv")
 .lazy
 .filter(Polars.col("sepal_length") > 5)
- .groupby("species")
+ .group_by("species")
 .agg(Polars.all.sum)
 .collect
 end
@@ -99,7 +99,7 @@ def test_contexts
 Polars.col("random").count.alias("count")
 ])
 
- output df.groupby("groups").agg([
+ output df.group_by("groups").agg([
 Polars.sum("nrs"),
 Polars.col("random").count.alias("count"),
 Polars.col("random").filter(Polars.col("names").is_not_null).sum.suffix("_sum"),
@@ -199,7 +199,7 @@ def test_examples
 # Polars.fold(Polars.col("A"), ->(a, b) { a + "-" + b }, Polars.all.exclude("A")).alias("str_concat_2")
 # ])
 
- output df.sort("cars").groupby("fruits")
+ output df.sort("cars").group_by("fruits")
 .agg([
 Polars.col("B").sum.alias("B_sum"),
 Polars.sum("B").alias("B_sum2"),
@@ -208,7 +208,7 @@ def test_examples
 Polars.col("cars").reverse
 ])
 
- output df.sort("cars").groupby("fruits")
+ output df.sort("cars").group_by("fruits")
 .agg([
 Polars.col("B").sum.alias("B_sum"),
 Polars.sum("B").alias("B_sum2"),
@@ -217,7 +217,7 @@ def test_examples
 Polars.col("cars").reverse
 ]).explode("cars")
 
- output df.groupby("fruits")
+ output df.group_by("fruits")
 .agg([
 Polars.col("B").sum.alias("B_sum"),
 Polars.sum("B").alias("B_sum2"),
@@ -226,7 +226,7 @@ def test_examples
 Polars.col("B").shift.alias("B_shifted")
 ]).explode("B_shifted")
 
- output df.sort("cars").groupby("fruits")
+ output df.sort("cars").group_by("fruits")
 .agg([
 Polars.col("B").sum,
 Polars.sum("B").alias("B_sum2"),
@@ -235,23 +235,23 @@ def test_examples
 Polars.col("cars").reverse
 ]).explode("cars")
 
- output df.groupby("fruits")
+ output df.group_by("fruits")
 .agg([
 Polars.col("B").shift.alias("shift_B"),
 Polars.col("B").reverse.alias("rev_B")
 ])
 
- output df.groupby("fruits")
+ output df.group_by("fruits")
 .agg([
 Polars.col("B").filter(Polars.col("B") > 1).implode.keep_name,
 ])
 
- output df.groupby("fruits")
+ output df.group_by("fruits")
 .agg([
 Polars.col("B").filter(Polars.col("B") > 1).mean,
 ])
 
- output df.groupby("fruits")
+ output df.group_by("fruits")
 .agg([
 Polars.col("B").shift_and_fill(1, 0).alias("shifted"),
 Polars.col("B").shift_and_fill(1, 0).sum.alias("shifted_sum")
diff --git a/test/plot_test.rb b/test/plot_test.rb
index 9c14f117ea..aa1184cc54 100644
--- a/test/plot_test.rb
+++ b/test/plot_test.rb
@@ -59,17 +59,17 @@ def test_group_option_pie
 def test_group_method
 df = Polars::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]})
- assert_group df.groupby("c").plot("a", "b", type: "line")
- assert_group df.groupby("c").plot("a", "b", type: "column")
"b", type: "column") - assert_group df.groupby("c").plot("a", "b", type: "bar") - assert_group df.groupby("c").plot("a", "b", type: "area") - assert_group df.groupby("c").plot("b", "b", type: "scatter") + assert_group df.group_by("c").plot("a", "b", type: "line") + assert_group df.group_by("c").plot("a", "b", type: "column") + assert_group df.group_by("c").plot("a", "b", type: "bar") + assert_group df.group_by("c").plot("a", "b", type: "area") + assert_group df.group_by("c").plot("b", "b", type: "scatter") end def test_group_method_multiple_columns df = Polars::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]}) error = assert_raises(ArgumentError) do - df.groupby(["c", "c"]).plot("a", "b") + df.group_by(["c", "c"]).plot("a", "b") end assert_equal "Multiple groups not supported", error.message end @@ -77,7 +77,7 @@ def test_group_method_multiple_columns def test_group_method_group_option df = Polars::DataFrame.new({"a" => ["one", "two", "three"], "b" => [1, 2, 3], "c" => ["group1", "group1", "group2"]}) error = assert_raises(ArgumentError) do - df.groupby("c").plot("a", "b", group: "c") + df.group_by("c").plot("a", "b", group: "c") end assert_equal "unknown keyword: :group", error.message end