Custom column ops

Use symbolic_dispatch() to create new functions for operating on columns.

This function creates what are called single generic functionsโ€” which let you register different ways to handle different types of data.

Define a new function

from siuba.siu import symbolic_dispatch
from pandas.core.groupby import SeriesGroupBy
from pandas import Series

@symbolic_dispatch
def all_like(col, text):
    raise NotImplementedError(f"Not implemented for class {type(col)}")

@all_like.register
def _all_like_ser(col: Series, text: str) -> Series:
    """Return transformation. Checks whether text is in each entry of col."""
    return col.str.contains(text).all()

Check for a translation

all_like.dispatch(Series)
<function __main__._all_like_ser(col: pandas.core.series.Series, text: str) -> pandas.core.series.Series>

Register an error with FunctionLookupBound

from siuba.siu import FunctionLookupBound

@symbolic_dispatch
def some_func(x):
    return 1

some_func.register(Series, FunctionLookupBound("Not implemented"))
<siuba.siu.visitors.FunctionLookupBound at 0x7fdf5ca09850>
f_concrete = some_func.dispatch(Series)

# indicates that a function is *not* implemented
isinstance(f_concrete, FunctionLookupBound)
True

Register a SQL translation

from sqlalchemy import sql

from siuba.sql.dialects.postgresql import PostgresqlColumn, PostgresqlColumnAgg
from siuba.sql.translate import AggOver

Transformation

@all_like.register
def _all_like_pg(
    codata: PostgresqlColumn,
    col: sql.ClauseElement,
    text: str
) -> sql.ClauseElement:
    return AggOver(sql.func.bool_and(col.like(text)))
expr = all_like(PostgresqlColumn(), sql.column("a"), "yo")
print(expr)
bool_and(a LIKE :a_1) OVER ()

Note that AggOver ensures the result is a transformation by using an OVER clause. The partition and ordering of the clause are set automatically by siuba verbs.

There are three special over clauses:

  • AggOver: handle an aggregate (e.g. AVG(hp) -> AVG(hp) OVER(...)).
  • RankOver: handle a ranking function (e.g. RANK() OVER (... ORDER BY ...)).
  • CumlOver: handle a cumulative function (e.g. SUM(hp) OVER (... ORDER BY ...)).

Aggregation

@all_like.register
def _like_pg_agg(
    codata: PostgresqlColumnAgg,
    col: sql.ClauseElement,
    text: str
) -> sql.ClauseElement:
    return sql.func.bool_and(col.like(text))


expr = all_like(PostgresqlColumnAgg(), sql.column("a"), "yo")
print(expr)
bool_and(a LIKE :a_1)

Call functions in functions

Use the codata parameter when calling another generic function inside your custom function.

# these are the functions used to translate pandas methods
from siuba.ops import mean, std

@symbolic_dispatch(cls = PostgresqlColumn)
def scale(codata, x):
    return (x - mean(codata, x)) / std(codata, x)
from siuba.data import cars_sql
from siuba import _, mutate, group_by, show_query

q = mutate(cars_sql, res = scale(_.mpg)) >> show_query(simplify=True)
SELECT *, (cars.mpg - avg(cars.mpg) OVER ()) / stddev_samp(cars.mpg) OVER () AS res 
FROM cars
q = cars_sql >> group_by(_.cyl) >> mutate(res = scale(_.mpg)) >> show_query(simplify=True)
SELECT *, (cars.mpg - avg(cars.mpg) OVER (PARTITION BY cars.cyl)) / stddev_samp(cars.mpg) OVER (PARTITION BY cars.cyl) AS res 
FROM cars