from siuba.siu import symbolic_dispatch
from pandas.core.groupby import SeriesGroupBy
from pandas import Series
@symbolic_dispatch
def all_like(col, text):
raise NotImplementedError(f"Not implemented for class {type(col)}")
@all_like.register
def _all_like_ser(col: Series, text: str) -> Series:
"""Return transformation. Checks whether text is in each entry of col."""
return col.str.contains(text).all()Custom column ops
Use symbolic_dispatch() to create new functions for operating on columns.
This function creates what are called single generic functionsโ which let you register different ways to handle different types of data.
Define a new function
Check for a translation
all_like.dispatch(Series)<function __main__._all_like_ser(col: pandas.core.series.Series, text: str) -> pandas.core.series.Series>
Register an error with FunctionLookupBound
from siuba.siu import FunctionLookupBound
@symbolic_dispatch
def some_func(x):
return 1
some_func.register(Series, FunctionLookupBound("Not implemented"))<siuba.siu.visitors.FunctionLookupBound at 0x7fdf5ca09850>
f_concrete = some_func.dispatch(Series)
# indicates that a function is *not* implemented
isinstance(f_concrete, FunctionLookupBound)True
Register a SQL translation
from sqlalchemy import sql
from siuba.sql.dialects.postgresql import PostgresqlColumn, PostgresqlColumnAgg
from siuba.sql.translate import AggOverTransformation
@all_like.register
def _all_like_pg(
codata: PostgresqlColumn,
col: sql.ClauseElement,
text: str
) -> sql.ClauseElement:
return AggOver(sql.func.bool_and(col.like(text)))expr = all_like(PostgresqlColumn(), sql.column("a"), "yo")
print(expr)bool_and(a LIKE :a_1) OVER ()
Note that AggOver ensures the result is a transformation by using an OVER clause. The partition and ordering of the clause are set automatically by siuba verbs.
There are three special over clauses:
AggOver: handle an aggregate (e.g.AVG(hp)->AVG(hp) OVER(...)).RankOver: handle a ranking function (e.g.RANK() OVER (... ORDER BY ...)).CumlOver: handle a cumulative function (e.g.SUM(hp) OVER (... ORDER BY ...)).
Aggregation
@all_like.register
def _like_pg_agg(
codata: PostgresqlColumnAgg,
col: sql.ClauseElement,
text: str
) -> sql.ClauseElement:
return sql.func.bool_and(col.like(text))
expr = all_like(PostgresqlColumnAgg(), sql.column("a"), "yo")
print(expr)bool_and(a LIKE :a_1)
Call functions in functions
Use the codata parameter when calling another generic function inside your custom function.
# these are the functions used to translate pandas methods
from siuba.ops import mean, std
@symbolic_dispatch(cls = PostgresqlColumn)
def scale(codata, x):
return (x - mean(codata, x)) / std(codata, x)from siuba.data import cars_sql
from siuba import _, mutate, group_by, show_query
q = mutate(cars_sql, res = scale(_.mpg)) >> show_query(simplify=True)SELECT *, (cars.mpg - avg(cars.mpg) OVER ()) / stddev_samp(cars.mpg) OVER () AS res
FROM cars
q = cars_sql >> group_by(_.cyl) >> mutate(res = scale(_.mpg)) >> show_query(simplify=True)SELECT *, (cars.mpg - avg(cars.mpg) OVER (PARTITION BY cars.cyl)) / stddev_samp(cars.mpg) OVER (PARTITION BY cars.cyl) AS res
FROM cars