Skip to content

group_by, ungroup

group_by(__data, *args, add=False, **kwargs)

Return a grouped DataFrame, using columns or expressions to define groups.

Any operations (e.g. summarize, mutate, filter) performed on grouped data will be performed "by group". Use ungroup() to remove the groupings.

Parameters:

Name Type Description Default
__data

The data being grouped.

required
*args

Lazy expressions used to select the grouping columns. Currently, each arg must refer to a single column (e.g. _.cyl, _.mpg).

()
add bool

If the data is already grouped, whether to add these groupings on top of those.

False
**kwargs

Keyword arguments define new columns used to group the data.

{}

Examples:

>>> from siuba import _, group_by, summarize, filter, mutate, head
>>> from siuba.data import cars
>>> by_cyl = cars >> group_by(_.cyl)
>>> by_cyl >> summarize(max_mpg = _.mpg.max(), max_hp = _.hp.max())
   cyl  max_mpg  max_hp
0    4     33.9     113
1    6     21.4     175
2    8     19.2     335
>>> by_cyl >> filter(_.mpg == _.mpg.max())
(grouped data frame)
    cyl   mpg   hp
3     6  21.4  110
19    4  33.9   65
24    8  19.2  175
>>> cars >> group_by(cyl2 = _.cyl + 1) >> head(2)
(grouped data frame)
   cyl   mpg   hp  cyl2
0    6  21.0  110     7
1    6  21.0  110     7

Note that creating the new grouping column is always performed on ungrouped data. Use an explicit mutate on the grouped data to perform the operation within groups.

For example, the code below calls pd.cut on the mpg column, within each cyl group.

>>> from siuba.siu import call
>>> (cars
...     >> group_by(_.cyl)
...     >> mutate(mpg_bin = call(pd.cut, _.mpg, 3))
...     >> group_by(_.mpg_bin, add=True)
...     >> head(2)
... )
(grouped data frame)
   cyl   mpg   hp       mpg_bin
0    6  21.0  110  (20.2, 21.4]
1    6  21.0  110  (20.2, 21.4]
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def group_by(__data, *args, add = False, **kwargs):
    """Return a grouped DataFrame, using columns or expressions to define groups.

    Any operations (e.g. summarize, mutate, filter) performed on grouped data
    will be performed "by group". Use `ungroup()` to remove the groupings.

    Parameters
    ----------
    __data:
        The data being grouped.
    *args:
        Lazy expressions used to select the grouping columns. Currently, each
        arg must refer to a single column (e.g. _.cyl, _.mpg).
    add: bool
        If the data is already grouped, whether to add these groupings on top of those.
    **kwargs:
        Keyword arguments define new columns used to group the data.

    Returns
    -------
    DataFrameGroupBy
        A copy of the underlying data, grouped by the selected or newly
        created columns (plus any existing groupings when ``add=True``).


    Examples
    --------

    >>> from siuba import _, group_by, summarize, filter, mutate, head
    >>> from siuba.data import cars

    >>> by_cyl = cars >> group_by(_.cyl)

    >>> by_cyl >> summarize(max_mpg = _.mpg.max(), max_hp = _.hp.max())
       cyl  max_mpg  max_hp
    0    4     33.9     113
    1    6     21.4     175
    2    8     19.2     335

    >>> by_cyl >> filter(_.mpg == _.mpg.max())
    (grouped data frame)
        cyl   mpg   hp
    3     6  21.4  110
    19    4  33.9   65
    24    8  19.2  175

    >>> cars >> group_by(cyl2 = _.cyl + 1) >> head(2)
    (grouped data frame)
       cyl   mpg   hp  cyl2
    0    6  21.0  110     7
    1    6  21.0  110     7

    Note that creating the new grouping column is always performed on ungrouped data.
    Use an explicit mutate on the grouped data to perform the operation within groups.

    For example, the code below calls pd.cut on the mpg column, within each cyl group.

    >>> from siuba.siu import call
    >>> (cars
    ...     >> group_by(_.cyl)
    ...     >> mutate(mpg_bin = call(pd.cut, _.mpg, 3))
    ...     >> group_by(_.mpg_bin, add=True)
    ...     >> head(2)
    ... )
    (grouped data frame)
       cyl   mpg   hp       mpg_bin
    0    6  21.0  110  (20.2, 21.4]
    1    6  21.0  110  (20.2, 21.4]

    """

    # Work on a copy of the underlying (ungrouped) frame, so the new grouping
    # columns assigned below never touch the caller's data.
    if isinstance(__data, DataFrameGroupBy):
        tmp_df = __data.obj.copy()
    else:
        tmp_df = __data.copy()

    # TODO: super inefficient, since it makes multiple copies of data
    #       need way to get the by_vars and apply (grouped) computation
    computed = transmute(tmp_df, *args, **kwargs)
    by_vars = list(computed.columns)

    for k in by_vars:
        tmp_df[k] = computed[k]

    if isinstance(__data, DataFrameGroupBy) and add:
        # pandas 2.2 deprecated the public `.grouper` attribute (it emits a
        # FutureWarning and is slated for removal); the same object lives on
        # `._grouper` there. Older pandas only has `.grouper`.
        try:
            grouper = __data._grouper
        except AttributeError:
            grouper = __data.grouper

        # keyed by name so a re-specified column replaces its old grouping
        groupings = {el.name: el for el in grouper.groupings}

        for varname in by_vars:
            # ensures group levels are recalculated if varname was in transmute
            groupings[varname] = varname

        return tmp_df.groupby(list(groupings.values()), dropna=False, group_keys=True)

    return tmp_df.groupby(by = by_vars, dropna=False, group_keys=True)

ungroup(__data)

Return an ungrouped DataFrame.

Parameters:

Name Type Description Default
__data

The data being ungrouped.

required

Examples:

>>> from siuba import _, group_by, ungroup
>>> from siuba.data import cars
>>> g_cyl = cars.groupby("cyl")
>>> res1 = ungroup(g_cyl)
>>> res2 = cars >> group_by(_.cyl) >> ungroup()
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def ungroup(__data):
    """Strip any grouping from the data and return a plain DataFrame.

    Parameters
    ----------
    __data:
        The data being ungrouped. Either a DataFrame (returned as is) or a
        grouped DataFrame (its underlying frame is returned).

    Raises
    ------
    TypeError
        If `__data` is neither a DataFrame nor a grouped DataFrame.

    Examples
    --------
    >>> from siuba import _, group_by, ungroup
    >>> from siuba.data import cars

    >>> g_cyl = cars.groupby("cyl")
    >>> res1 = ungroup(g_cyl)

    >>> res2 = cars >> group_by(_.cyl) >> ungroup()
    """
    # TODO: can we somehow just restore the original df used to construct
    #       the groupby?
    if isinstance(__data, DataFrameGroupBy):
        # hand back the frame stored on the groupby's `.obj` attribute
        return __data.obj

    if isinstance(__data, pd.DataFrame):
        # already ungrouped; nothing to do
        return __data

    raise TypeError(f"Unsupported type {type(__data)}")