Skip to content

Threshold Segmentation

Threshold-Based Customer Segmentation Module.

This module provides the ThresholdSegmentation class, which segments customers based on user-defined thresholds and segment mappings.

Key Features:

- Segments customers based on specified percentile thresholds.
- Uses a specified column for segmentation, with an aggregation function applied.
- Handles customers with zero spend using configurable options.
- Utilizes Ibis for efficient query execution.

ThresholdSegmentation

Bases: BaseSegmentation

Segments customers based on user-defined thresholds and segments.

Source code in pyretailscience/segmentation/threshold.py
class ThresholdSegmentation(BaseSegmentation):
    """Segments customers based on user-defined thresholds and segments.

    Each customer's value column is aggregated, the customers are ranked by that aggregate, and every
    customer is assigned the segment of the smallest threshold that its percent rank does not exceed.
    """

    # Cache for the executed result; populated on first access of the `df` property.
    _df: pd.DataFrame | None = None

    def __init__(
        self,
        df: pd.DataFrame | ibis.Table,
        thresholds: list[float],
        segments: list[str],
        value_col: str | None = None,
        agg_func: str = "sum",
        zero_segment_name: str = "Zero",
        zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
    ) -> None:
        """Segments customers based on user-defined thresholds and segments.

        Args:
            df (pd.DataFrame | ibis.Table): A dataframe with the transaction data. The dataframe must contain a
                customer_id column. A pandas dataframe is wrapped in an Ibis in-memory table.
            thresholds (List[float]): The percentile thresholds for segmentation. Each threshold is the upper
                (inclusive) percent-rank bound of its segment. They may be given in any order; they are paired
                with `segments` by position and then evaluated in ascending order.
            segments (List[str]): A list of segment names for each threshold.
            value_col (str, optional): The column to use for the segmentation. Defaults to
                get_option("column.unit_spend").
            agg_func (str, optional): The aggregation function to use when grouping by customer_id. It is looked
                up by name as a method of the Ibis column. Defaults to "sum".
            zero_segment_name (str, optional): The name of the segment for customers with zero spend. Defaults to
                "Zero".
            zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How to
                handle customers with zero spend. "exclude" drops them, "separate_segment" labels them with
                `zero_segment_name` and ranks only the remaining customers, and any other value (including
                "include_with_light") leaves them in the ranked population. Defaults to "separate_segment".

        Raises:
            ValueError: If the thresholds are not unique, if the number of thresholds does not match the number
                of segments, or if the dataframe is missing the column named by option column.customer_id or
                `value_col`.
        """
        if len(thresholds) != len(set(thresholds)):
            raise ValueError("The thresholds must be unique.")

        if len(thresholds) != len(segments):
            raise ValueError("The number of thresholds must match the number of segments.")

        if isinstance(df, pd.DataFrame):
            df = ibis.memtable(df)

        value_col = get_option("column.unit_spend") if value_col is None else value_col
        customer_id_col = get_option("column.customer_id")

        missing_cols = {customer_id_col, value_col} - set(df.columns)
        if missing_cols:
            msg = f"The following columns are required but missing: {missing_cols}"
            raise ValueError(msg)

        # Collapse the transactions to one row per customer holding the aggregated value.
        df = df.group_by(customer_id_col).aggregate(
            **{value_col: getattr(df[value_col], agg_func)()},
        )

        # Separate customers with zero spend
        zero_df = None
        if zero_value_customers == "exclude":
            df = df.filter(df[value_col] != 0)
        elif zero_value_customers == "separate_segment":
            zero_df = df.filter(df[value_col] == 0).mutate(segment_name=ibis.literal(zero_segment_name))
            df = df.filter(df[value_col] != 0)

        window = ibis.window(order_by=ibis.asc(df[value_col]))
        df = df.mutate(ptile=ibis.percent_rank().over(window))

        # CASE branches are tested in order and the first match wins, so the pairs must be evaluated from
        # the smallest threshold upwards; otherwise a larger threshold listed first would shadow the rest.
        ordered_pairs = sorted(zip(thresholds, segments, strict=True), key=lambda pair: pair[0])

        case = ibis.case()
        for quantile, segment in ordered_pairs:
            case = case.when(df["ptile"] <= quantile, segment)
        # NOTE(review): no else branch is supplied, so customers ranked above the largest threshold
        # presumably end up with a null segment_name - confirm against the Ibis case builder.
        case = case.end()

        df = df.mutate(segment_name=case).drop(["ptile"])

        # zero_df is only populated on the "separate_segment" path.
        if zero_df is not None:
            df = ibis.union(df, zero_df)

        self.table = df

    @property
    def df(self) -> pd.DataFrame:
        """Returns the dataframe with the segment names.

        The Ibis table is executed on first access, indexed by the customer id column, and the result is
        cached on the instance for subsequent accesses.
        """
        if self._df is None:
            self._df = self.table.execute().set_index(get_option("column.customer_id"))
        return self._df

df: pd.DataFrame property

Returns the dataframe with the segment names.

__init__(df, thresholds, segments, value_col=None, agg_func='sum', zero_segment_name='Zero', zero_value_customers='separate_segment')

Segments customers based on user-defined thresholds and segments.

Parameters:

Name Type Description Default
df DataFrame | Table

A dataframe with the transaction data. The dataframe must contain a customer_id column.

required
thresholds List[float]

The percentile thresholds for segmentation.

required
segments List[str]

A list of segment names for each threshold.

required
value_col str

The column to use for the segmentation. Defaults to get_option("column.unit_spend").

None
agg_func str

The aggregation function to use when grouping by customer_id. Defaults to "sum".

'sum'
zero_segment_name str

The name of the segment for customers with zero spend. Defaults to "Zero".

'Zero'
zero_value_customers Literal['separate_segment', 'exclude', 'include_with_light']

How to handle customers with zero spend. Defaults to "separate_segment".

'separate_segment'

Raises:

Type Description
ValueError

If the thresholds are not unique, if the number of thresholds does not match the number of segments, or if the dataframe is missing the customer ID column (option column.customer_id) or value_col.

Source code in pyretailscience/segmentation/threshold.py
def __init__(
    self,
    df: pd.DataFrame | ibis.Table,
    thresholds: list[float],
    segments: list[str],
    value_col: str | None = None,
    agg_func: str = "sum",
    zero_segment_name: str = "Zero",
    zero_value_customers: Literal["separate_segment", "exclude", "include_with_light"] = "separate_segment",
) -> None:
    """Segments customers based on user-defined thresholds and segments.

    Args:
        df (pd.DataFrame | ibis.Table): A dataframe with the transaction data. The dataframe must contain a
            customer_id column. A pandas dataframe is wrapped in an Ibis in-memory table.
        thresholds (List[float]): The percentile thresholds for segmentation. Each threshold is the upper
            (inclusive) percent-rank bound of its segment. They may be given in any order; they are paired
            with `segments` by position and then evaluated in ascending order.
        segments (List[str]): A list of segment names for each threshold.
        value_col (str, optional): The column to use for the segmentation. Defaults to
            get_option("column.unit_spend").
        agg_func (str, optional): The aggregation function to use when grouping by customer_id. It is looked
            up by name as a method of the Ibis column. Defaults to "sum".
        zero_segment_name (str, optional): The name of the segment for customers with zero spend. Defaults to
            "Zero".
        zero_value_customers (Literal["separate_segment", "exclude", "include_with_light"], optional): How to
            handle customers with zero spend. "exclude" drops them, "separate_segment" labels them with
            `zero_segment_name` and ranks only the remaining customers, and any other value (including
            "include_with_light") leaves them in the ranked population. Defaults to "separate_segment".

    Raises:
        ValueError: If the thresholds are not unique, if the number of thresholds does not match the number
            of segments, or if the dataframe is missing the column named by option column.customer_id or
            `value_col`.
    """
    if len(thresholds) != len(set(thresholds)):
        raise ValueError("The thresholds must be unique.")

    if len(thresholds) != len(segments):
        raise ValueError("The number of thresholds must match the number of segments.")

    if isinstance(df, pd.DataFrame):
        df = ibis.memtable(df)

    value_col = get_option("column.unit_spend") if value_col is None else value_col
    customer_id_col = get_option("column.customer_id")

    missing_cols = {customer_id_col, value_col} - set(df.columns)
    if missing_cols:
        msg = f"The following columns are required but missing: {missing_cols}"
        raise ValueError(msg)

    # Collapse the transactions to one row per customer holding the aggregated value.
    df = df.group_by(customer_id_col).aggregate(
        **{value_col: getattr(df[value_col], agg_func)()},
    )

    # Separate customers with zero spend
    zero_df = None
    if zero_value_customers == "exclude":
        df = df.filter(df[value_col] != 0)
    elif zero_value_customers == "separate_segment":
        zero_df = df.filter(df[value_col] == 0).mutate(segment_name=ibis.literal(zero_segment_name))
        df = df.filter(df[value_col] != 0)

    window = ibis.window(order_by=ibis.asc(df[value_col]))
    df = df.mutate(ptile=ibis.percent_rank().over(window))

    # CASE branches are tested in order and the first match wins, so the pairs must be evaluated from
    # the smallest threshold upwards; otherwise a larger threshold listed first would shadow the rest.
    ordered_pairs = sorted(zip(thresholds, segments, strict=True), key=lambda pair: pair[0])

    case = ibis.case()
    for quantile, segment in ordered_pairs:
        case = case.when(df["ptile"] <= quantile, segment)
    # NOTE(review): no else branch is supplied, so customers ranked above the largest threshold
    # presumably end up with a null segment_name - confirm against the Ibis case builder.
    case = case.end()

    df = df.mutate(segment_name=case).drop(["ptile"])

    # zero_df is only populated on the "separate_segment" path.
    if zero_df is not None:
        df = ibis.union(df, zero_df)

    self.table = df