from siuba.siu import symbolic_dispatch
from pandas.core.groupby import SeriesGroupBy
from pandas import Series
@symbolic_dispatch
def all_like(col, text):
    """Generic function whose behavior is registered per type of ``col``.

    This body is the fallback used when no implementation has been
    registered for the type of ``col``.

    Raises:
        NotImplementedError: always, naming the class of ``col``.
    """
    raise NotImplementedError(f"Not implemented for class {type(col)}")
@all_like.register
def _all_like_ser(col: Series, text: str) -> Series:
    """Return transformation. Checks whether text is in each entry of col.

    ``text`` is handed to ``Series.str.contains`` unchanged, so it is matched
    with that method's default settings.
    """
    # NOTE(review): .all() collapses the per-entry matches into one boolean,
    # while the annotation says Series. The annotation is left as-is because
    # it is part of the registered signature -- confirm whether siuba relies
    # on it.
    return col.str.contains(text).all()
Custom column ops
Use `symbolic_dispatch()` to create new functions for operating on columns.
This function creates what are called single generic functions, which let you register different ways to handle different types of data.
Define a new function
Check for a translation
# Look up the implementation that all_like has registered for a pandas Series.
all_like.dispatch(Series)
<function __main__._all_like_ser(col: pandas.core.series.Series, text: str) -> pandas.core.series.Series>
Register an error with FunctionLookupBound
from siuba.siu import FunctionLookupBound
@symbolic_dispatch
def some_func(x):
    """Default implementation: return 1 for any input."""
    return 1


# Registering a FunctionLookupBound for Series marks some_func as
# *not* implemented for that type.
some_func.register(Series, FunctionLookupBound("Not implemented"))
<siuba.siu.visitors.FunctionLookupBound at 0x7fdf5ca09850>
# Fetch whatever is registered for Series (here, the FunctionLookupBound).
f_concrete = some_func.dispatch(Series)

# indicates that a function is *not* implemented
isinstance(f_concrete, FunctionLookupBound)
True
Register a SQL translation
from sqlalchemy import sql
from siuba.sql.dialects.postgresql import PostgresqlColumn, PostgresqlColumnAgg
from siuba.sql.translate import AggOver
Transformation
@all_like.register
def _all_like_pg(
    codata: PostgresqlColumn,
    col: sql.ClauseElement,
    text: str,
) -> sql.ClauseElement:
    """PostgreSQL transformation: ``bool_and(col LIKE text)`` as a window expression.

    Args:
        codata: Dispatch marker selecting the PostgreSQL column implementation.
        col: SQL expression for the column being tested.
        text: Pattern passed to ``LIKE``.

    Returns:
        The ``bool_and`` aggregate wrapped in ``AggOver`` so that it is
        rendered with an ``OVER`` clause.
    """
    return AggOver(sql.func.bool_and(col.like(text)))
# Build the windowed expression for column "a" and show the generated SQL.
expr = all_like(PostgresqlColumn(), sql.column("a"), "yo")
print(expr)
bool_and(a LIKE :a_1) OVER ()
Note that `AggOver` ensures the result is a transformation by using an `OVER` clause.
The partition and ordering of the clause are set automatically by siuba verbs.
There are three special over clauses:
- `AggOver`: handle an aggregate (e.g. `AVG(hp)` -> `AVG(hp) OVER(...)`).
- `RankOver`: handle a ranking function (e.g. `RANK() OVER (... ORDER BY ...)`).
- `CumlOver`: handle a cumulative function (e.g. `SUM(hp) OVER (... ORDER BY ...)`).
Aggregation
@all_like.register
def _like_pg_agg(
    codata: PostgresqlColumnAgg,
    col: sql.ClauseElement,
    text: str,
) -> sql.ClauseElement:
    """PostgreSQL aggregation: plain ``bool_and(col LIKE text)``.

    Args:
        codata: Dispatch marker selecting the PostgreSQL aggregate implementation.
        col: SQL expression for the column being tested.
        text: Pattern passed to ``LIKE``.

    Returns:
        The ``bool_and`` aggregate expression, with no ``OVER`` wrapper.
    """
    return sql.func.bool_and(col.like(text))
# Build the aggregate expression for column "a" and show the generated SQL.
expr = all_like(PostgresqlColumnAgg(), sql.column("a"), "yo")
print(expr)
bool_and(a LIKE :a_1)
Call functions in functions
Use the `codata` parameter when calling another generic function inside your custom function.
# these are the functions used to translate pandas methods
from siuba.ops import mean, std
@symbolic_dispatch(cls=PostgresqlColumn)
def scale(codata, x):
    """Subtract the mean of ``x`` from ``x`` and divide by its standard deviation.

    ``codata`` is forwarded to ``mean`` and ``std`` because they are generic
    functions too and take it as their first argument.
    """
    return (x - mean(codata, x)) / std(codata, x)
from siuba.data import cars_sql
from siuba import _, mutate, group_by, show_query
# Ungrouped: show the query generated when scale() is used in mutate.
q = mutate(cars_sql, res=scale(_.mpg)) >> show_query(simplify=True)
SELECT *, (cars.mpg - avg(cars.mpg) OVER ()) / stddev_samp(cars.mpg) OVER () AS res
FROM cars
# Grouped by cyl: show the query generated for the same scale() call.
q = cars_sql >> group_by(_.cyl) >> mutate(res=scale(_.mpg)) >> show_query(simplify=True)
SELECT *, (cars.mpg - avg(cars.mpg) OVER (PARTITION BY cars.cyl)) / stddev_samp(cars.mpg) OVER (PARTITION BY cars.cyl) AS res
FROM cars