Skip to main content

Course Progress

Loading...

GROUP BY & HAVING - Data Grouping Mastery

Duration: 90 minutes
Module 3: Session 4

Learning Objectives

  • Master GROUP BY fundamentals and execution order
  • Group by single and multiple columns
  • Understand the difference between WHERE and HAVING
  • Use GROUP BY with aggregate functions
  • Implement ROLLUP and CUBE for subtotals
  • Apply GROUPING SETS for flexible grouping
  • Handle NULL values in grouping
  • Optimize GROUP BY performance

Understanding GROUP BY

GROUP BY transforms your data from individual rows into meaningful groups, enabling powerful data analysis and reporting! 📊

💡
GROUP BY Golden Rules
  • Every column in SELECT must be either in GROUP BY or an aggregate function
  • GROUP BY creates one row per unique group value
  • NULL values form their own group
  • Order of columns in GROUP BY matters for performance
  • Can group by expressions, not just columns

Basic GROUP BY Operations

graph TB GROUP[GROUP BY] --> SINGLE[Single Column] GROUP --> MULTI[Multiple Columns] GROUP --> EXPR[Expressions] GROUP --> POS[Position] SINGLE --> S1[Simple grouping] MULTI --> M1[Hierarchical groups] EXPR --> E1[Calculated groups] POS --> P1[GROUP BY 1, 2] style GROUP fill:#dcfce7 style SINGLE fill:#dbeafe style MULTI fill:#fef3c7 style EXPR fill:#fee2e2

GROUP BY Fundamentals

-- ========================================
-- SINGLE COLUMN GROUPING
-- ========================================

-- Basic GROUP BY with COUNT
SELECT 
    category,
    COUNT(*) AS product_count
FROM products
GROUP BY category;

-- GROUP BY with multiple aggregates
SELECT 
    customer_id,
    COUNT(*) AS order_count,
    SUM(amount) AS total_spent,
    AVG(amount) AS avg_order_value,
    MIN(order_date) AS first_order,
    MAX(order_date) AS last_order
FROM orders
GROUP BY customer_id;

-- GROUP BY with JOIN
SELECT 
    c.country,
    COUNT(DISTINCT c.customer_id) AS customers,
    COUNT(o.order_id) AS orders,
    SUM(o.amount) AS revenue
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.country
ORDER BY revenue DESC;

-- ========================================
-- MULTIPLE COLUMN GROUPING
-- ========================================

-- Hierarchical grouping
SELECT 
    YEAR(order_date) AS year,
    MONTH(order_date) AS month,
    category,
    COUNT(*) AS sales_count,
    SUM(amount) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY 
    YEAR(order_date),
    MONTH(order_date),
    category
ORDER BY year DESC, month DESC, revenue DESC;

-- Multi-level aggregation
SELECT 
    country,
    state,
    city,
    COUNT(*) AS customer_count,
    AVG(lifetime_value) AS avg_ltv
FROM customers
GROUP BY country, state, city
WITH ROLLUP;  -- Adds subtotals

-- ========================================
-- GROUP BY EXPRESSIONS
-- ========================================

-- Group by calculated values
SELECT 
    CASE 
        WHEN amount < 100 THEN 'Small'
        WHEN amount < 500 THEN 'Medium'
        WHEN amount < 1000 THEN 'Large'
        ELSE 'Enterprise'
    END AS order_size,
    COUNT(*) AS order_count,
    AVG(amount) AS avg_amount
FROM orders
GROUP BY 
    CASE 
        WHEN amount < 100 THEN 'Small'
        WHEN amount < 500 THEN 'Medium'
        WHEN amount < 1000 THEN 'Large'
        ELSE 'Enterprise'
    END;

-- Group by date parts
SELECT 
    DATE_FORMAT(order_date, '%Y-%m') AS month,
    DAYOFWEEK(order_date) AS day_of_week,
    CASE DAYOFWEEK(order_date)
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END AS day_name,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders
GROUP BY 
    DATE_FORMAT(order_date, '%Y-%m'),
    DAYOFWEEK(order_date)
ORDER BY month, day_of_week;

-- ========================================
-- GROUP BY WITH NULL HANDLING
-- ========================================

-- NULLs form their own group
SELECT 
    COALESCE(category, 'Uncategorized') AS category_name,
    COUNT(*) AS product_count,
    AVG(price) AS avg_price
FROM products
GROUP BY category;  -- NULL category becomes a group

-- Exclude NULL groups
SELECT 
    category,
    COUNT(*) AS count
FROM products
WHERE category IS NOT NULL
GROUP BY category;

-- ========================================
-- GROUP BY POSITION (Less Readable)
-- ========================================

-- Group by column position
SELECT 
    category,
    brand,
    COUNT(*) AS count
FROM products
GROUP BY 1, 2;  -- Groups by first and second column

-- Better to use column names for clarity
SELECT 
    category,
    brand,
    COUNT(*) AS count
FROM products
GROUP BY category, brand;

HAVING Clause - Filtering Groups

HAVING Clause Examples

-- ========================================
-- BASIC HAVING EXAMPLES
-- ========================================

-- Filter groups by count
SELECT 
    category,
    COUNT(*) AS product_count
FROM products
GROUP BY category
HAVING COUNT(*) > 10;

-- Multiple conditions in HAVING
SELECT 
    customer_id,
    COUNT(*) AS order_count,
    SUM(amount) AS total_spent,
    AVG(amount) AS avg_order
FROM orders
GROUP BY customer_id
HAVING 
    COUNT(*) >= 5
    AND SUM(amount) > 1000
    AND AVG(amount) > 100;

-- ========================================
-- WHERE AND HAVING TOGETHER
-- ========================================

-- Best practice: Filter early with WHERE
SELECT 
    c.country,
    COUNT(DISTINCT c.customer_id) AS customers,
    SUM(o.amount) AS revenue
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
WHERE 
    o.status = 'completed'  -- WHERE: filter individual orders
    AND o.order_date >= '2024-01-01'
GROUP BY c.country
HAVING 
    COUNT(DISTINCT c.customer_id) > 100  -- HAVING: filter country groups
    AND SUM(o.amount) > 10000
ORDER BY revenue DESC;

-- ========================================
-- HAVING WITH AGGREGATE COMPARISONS
-- ========================================

-- Groups above average
SELECT 
    category,
    AVG(price) AS category_avg_price
FROM products
GROUP BY category
HAVING AVG(price) > (
    SELECT AVG(price) FROM products
);

-- Relative comparisons
SELECT 
    salesperson_id,
    SUM(sale_amount) AS total_sales
FROM sales
WHERE YEAR(sale_date) = 2024
GROUP BY salesperson_id
HAVING SUM(sale_amount) > (
    SELECT AVG(total_sales) * 1.2  -- 20% above average
    FROM (
        SELECT SUM(sale_amount) AS total_sales
        FROM sales
        WHERE YEAR(sale_date) = 2024
        GROUP BY salesperson_id
    ) AS avg_calc
);

-- ========================================
-- HAVING WITHOUT GROUP BY
-- ========================================

-- HAVING can be used without GROUP BY (treats all rows as one group)
SELECT 
    COUNT(*) AS total_products,
    AVG(price) AS avg_price
FROM products
HAVING AVG(price) > 50;  -- Filters the single group

-- ========================================
-- COMPLEX HAVING PATTERNS
-- ========================================

-- Find duplicate records
SELECT 
    email,
    COUNT(*) AS duplicate_count
FROM customers
GROUP BY email
HAVING COUNT(*) > 1;

-- Categories with diverse price ranges
SELECT 
    category,
    COUNT(*) AS products,
    MIN(price) AS min_price,
    MAX(price) AS max_price,
    MAX(price) - MIN(price) AS price_range
FROM products
GROUP BY category
HAVING 
    MAX(price) - MIN(price) > 500
    AND COUNT(*) > 5
ORDER BY price_range DESC;

-- Customers with irregular ordering patterns
SELECT 
    customer_id,
    COUNT(*) AS order_count,
    STD(amount) AS order_std_dev,
    AVG(amount) AS avg_amount
FROM orders
GROUP BY customer_id
HAVING 
    STD(amount) > AVG(amount) * 0.5  -- High variation
    AND COUNT(*) > 10;

-- Time-based group filtering
SELECT 
    DATE_FORMAT(order_date, '%Y-%m') AS month,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
HAVING 
    COUNT(*) > (
        SELECT AVG(monthly_orders)
        FROM (
            SELECT COUNT(*) AS monthly_orders
            FROM orders
            GROUP BY DATE_FORMAT(order_date, '%Y-%m')
        ) AS monthly_avg
    );

Advanced Grouping: ROLLUP, CUBE, and GROUPING SETS

⚠️
MySQL Support Note
MySQL supports ROLLUP but not CUBE or GROUPING SETS. We'll show workarounds for missing features.

Advanced Grouping Techniques

-- ========================================
-- ROLLUP - Hierarchical Subtotals
-- ========================================

-- Simple ROLLUP
SELECT 
    category,
    COUNT(*) AS products,
    SUM(stock * price) AS inventory_value
FROM products
GROUP BY category WITH ROLLUP;

-- Multi-level ROLLUP
SELECT 
    COALESCE(YEAR(order_date), 'Grand Total') AS year,
    COALESCE(MONTH(order_date), 'Year Total') AS month,
    COALESCE(category, 'Month Total') AS category,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.status = 'completed'
GROUP BY 
    YEAR(order_date),
    MONTH(order_date),
    category 
WITH ROLLUP;

-- ROLLUP with GROUPING() function
SELECT 
    category,
    brand,
    COUNT(*) AS count,
    SUM(price) AS total_price,
    CASE 
        WHEN GROUPING(category) = 1 THEN 'GRAND TOTAL'
        WHEN GROUPING(brand) = 1 THEN 'Category Subtotal'
        ELSE 'Detail'
    END AS level_type
FROM products
GROUP BY category, brand WITH ROLLUP;

-- ========================================
-- CUBE Alternative (MySQL doesn't support CUBE)
-- ========================================

-- Simulate CUBE with UNION
-- CUBE would give all possible grouping combinations

-- All combinations for (A, B)
-- 1. GROUP BY A, B
-- 2. GROUP BY A
-- 3. GROUP BY B
-- 4. Grand total

(SELECT 
    region,
    product_category,
    SUM(sales_amount) AS total_sales,
    'Region-Category' AS grouping_level
FROM sales
GROUP BY region, product_category)

UNION ALL

(SELECT 
    region,
    NULL AS product_category,
    SUM(sales_amount) AS total_sales,
    'Region Only' AS grouping_level
FROM sales
GROUP BY region)

UNION ALL

(SELECT 
    NULL AS region,
    product_category,
    SUM(sales_amount) AS total_sales,
    'Category Only' AS grouping_level
FROM sales
GROUP BY product_category)

UNION ALL

(SELECT 
    NULL AS region,
    NULL AS product_category,
    SUM(sales_amount) AS total_sales,
    'Grand Total' AS grouping_level
FROM sales)

ORDER BY 
    COALESCE(region, 'ZZZZZ'),
    COALESCE(product_category, 'ZZZZZ');

-- ========================================
-- GROUPING SETS Alternative (MySQL workaround)
-- ========================================

-- Simulate GROUPING SETS with UNION
-- Different grouping combinations as needed

(SELECT 
    'By Year-Month' AS group_type,
    YEAR(order_date) AS year,
    MONTH(order_date) AS month,
    NULL AS category,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders
GROUP BY YEAR(order_date), MONTH(order_date))

UNION ALL

(SELECT 
    'By Category' AS group_type,
    NULL AS year,
    NULL AS month,
    category,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY category)

UNION ALL

(SELECT 
    'Grand Total' AS group_type,
    NULL AS year,
    NULL AS month,
    NULL AS category,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders)

ORDER BY group_type, year, month, category;

-- ========================================
-- Practical ROLLUP Applications
-- ========================================

-- Sales hierarchy report
SELECT 
    COALESCE(region, 'ALL REGIONS') AS region,
    COALESCE(country, 'All Countries') AS country,
    COALESCE(state, 'All States') AS state,
    COUNT(DISTINCT customer_id) AS customers,
    COUNT(order_id) AS orders,
    FORMAT(SUM(amount), 2) AS revenue,
    FORMAT(AVG(amount), 2) AS avg_order_value
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.status = 'completed'
GROUP BY region, country, state WITH ROLLUP
HAVING region IS NOT NULL OR (region IS NULL AND country IS NULL AND state IS NULL);

-- Inventory summary with subtotals
SELECT 
    COALESCE(warehouse, 'ALL WAREHOUSES') AS warehouse,
    COALESCE(category, 'All Categories') AS category,
    COUNT(DISTINCT product_id) AS products,
    SUM(quantity_on_hand) AS total_units,
    FORMAT(SUM(quantity_on_hand * unit_cost), 2) AS inventory_value,
    CASE 
        WHEN GROUPING(warehouse) = 1 THEN '*** GRAND TOTAL ***'
        WHEN GROUPING(category) = 1 THEN '** Warehouse Total **'
        ELSE ''
    END AS summary_level
FROM inventory
GROUP BY warehouse, category WITH ROLLUP;

GROUP BY Performance Optimization

Optimizing GROUP BY Queries

-- ========================================
-- INDEX OPTIMIZATION FOR GROUP BY
-- ========================================

-- Create index for GROUP BY columns
CREATE INDEX idx_groupby ON orders(customer_id, status, order_date);

-- Query uses index for grouping
EXPLAIN SELECT 
    customer_id,
    status,
    COUNT(*),
    SUM(amount)
FROM orders
GROUP BY customer_id, status;

-- Covering index includes all needed columns
CREATE INDEX idx_covering ON orders(
    customer_id, 
    status, 
    amount
);

-- ========================================
-- OPTIMIZE GROUP BY ORDER
-- ========================================

-- Match GROUP BY order to index order
-- Good: Matches index (customer_id, status, order_date)
SELECT customer_id, status, DATE(order_date), COUNT(*)
FROM orders
GROUP BY customer_id, status, DATE(order_date);

-- Bad: Different order than index
SELECT DATE(order_date), customer_id, status, COUNT(*)
FROM orders
GROUP BY DATE(order_date), customer_id, status;

-- ========================================
-- REDUCE GROUPED DATA VOLUME
-- ========================================

-- Filter before grouping (use WHERE not HAVING when possible)
-- Efficient:
SELECT category, COUNT(*)
FROM products
WHERE price > 100  -- Filters first
GROUP BY category;

-- Inefficient:
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING AVG(price) > 100;  -- Filters after grouping

-- ========================================
-- SUMMARY TABLES FOR PERFORMANCE
-- ========================================

-- Create materialized summary table
CREATE TABLE daily_sales_summary (
    sale_date DATE PRIMARY KEY,
    total_orders INT,
    total_revenue DECIMAL(10,2),
    unique_customers INT,
    INDEX idx_date (sale_date)
);

-- Populate summary table (run periodically)
INSERT INTO daily_sales_summary
SELECT 
    DATE(order_date),
    COUNT(*),
    SUM(amount),
    COUNT(DISTINCT customer_id)
FROM orders
WHERE DATE(order_date) = CURDATE() - INTERVAL 1 DAY
GROUP BY DATE(order_date)
ON DUPLICATE KEY UPDATE
    total_orders = VALUES(total_orders),
    total_revenue = VALUES(total_revenue),
    unique_customers = VALUES(unique_customers);

-- ========================================
-- PARTITION OPTIMIZATION
-- ========================================

-- Partition large tables for better GROUP BY performance
ALTER TABLE orders
PARTITION BY RANGE (YEAR(order_date)) (
    PARTITION p2022 VALUES LESS THAN (2023),
    PARTITION p2023 VALUES LESS THAN (2024),
    PARTITION p2024 VALUES LESS THAN (2025),
    PARTITION p_future VALUES LESS THAN MAXVALUE
);

-- ========================================
-- MONITORING GROUP BY PERFORMANCE
-- ========================================

-- Check temporary table usage
SHOW STATUS LIKE 'Created_tmp%';

-- Analyze slow GROUP BY queries
EXPLAIN FORMAT=JSON
SELECT 
    customer_id,
    YEAR(order_date) AS year,
    COUNT(*) AS orders,
    SUM(amount) AS revenue
FROM orders
GROUP BY customer_id, YEAR(order_date)
HAVING orders > 10;

Practice Exercise

💻
GROUP BY Challenges
Solve these real-world grouping scenarios:
  1. Find the top 3 customers by revenue in each country
  2. Calculate year-over-year growth by category
  3. Identify products that sell better on weekends
  4. Create a cohort analysis by customer signup month
  5. Find categories where all products are above average price
  6. Generate a P&L summary with subtotals using ROLLUP
  7. Detect seasonal patterns in sales data
  8. Build a customer segmentation based on purchase behavior
💡 Click for Solutions
-- 1. Top 3 customers by revenue in each country
WITH customer_rankings AS (
    SELECT 
        c.country,
        c.customer_id,
        c.customer_name,
        SUM(o.amount) AS total_revenue,
        ROW_NUMBER() OVER (
            PARTITION BY c.country 
            ORDER BY SUM(o.amount) DESC
        ) AS rank_in_country
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    WHERE o.status = 'completed'
    GROUP BY c.country, c.customer_id
)
SELECT *
FROM customer_rankings
WHERE rank_in_country <= 3
ORDER BY country, rank_in_country;

-- 2. Year-over-year growth by category
WITH yearly_sales AS (
    SELECT 
        p.category,
        YEAR(o.order_date) AS year,
        SUM(oi.quantity * oi.unit_price) AS revenue
    FROM orders o
    JOIN order_items oi ON o.order_id = oi.order_id
    JOIN products p ON oi.product_id = p.product_id
    WHERE o.status = 'completed'
    GROUP BY p.category, YEAR(o.order_date)
)
SELECT 
    category,
    year,
    revenue,
    LAG(revenue) OVER (PARTITION BY category ORDER BY year) AS prev_year_revenue,
    ROUND(
        (revenue - LAG(revenue) OVER (PARTITION BY category ORDER BY year)) / 
        LAG(revenue) OVER (PARTITION BY category ORDER BY year) * 100, 
        2
    ) AS yoy_growth_percent
FROM yearly_sales
ORDER BY category, year;

-- 3. Products that sell better on weekends
SELECT 
    p.product_id,
    p.product_name,
    SUM(CASE WHEN DAYOFWEEK(o.order_date) IN (1, 7) THEN oi.quantity ELSE 0 END) AS weekend_sales,
    SUM(CASE WHEN DAYOFWEEK(o.order_date) NOT IN (1, 7) THEN oi.quantity ELSE 0 END) AS weekday_sales,
    ROUND(
        SUM(CASE WHEN DAYOFWEEK(o.order_date) IN (1, 7) THEN oi.quantity ELSE 0 END) * 100.0 /
        SUM(oi.quantity), 2
    ) AS weekend_percentage
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
GROUP BY p.product_id
HAVING weekend_percentage > 40  -- More than 40% of sales on weekends
ORDER BY weekend_percentage DESC;

-- 4. Cohort analysis by signup month
SELECT 
    DATE_FORMAT(c.created_at, '%Y-%m') AS cohort_month,
    DATE_FORMAT(o.order_date, '%Y-%m') AS order_month,
    PERIOD_DIFF(
        DATE_FORMAT(o.order_date, '%Y%m'),
        DATE_FORMAT(c.created_at, '%Y%m')
    ) AS months_since_signup,
    COUNT(DISTINCT c.customer_id) AS customers,
    COUNT(o.order_id) AS orders,
    SUM(o.amount) AS revenue
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY cohort_month, order_month
ORDER BY cohort_month, months_since_signup;

-- 5. Categories where all products are above average
SELECT 
    category,
    COUNT(*) AS product_count,
    MIN(price) AS min_price,
    AVG(price) AS avg_price
FROM products
GROUP BY category
HAVING MIN(price) > (SELECT AVG(price) FROM products);

-- 6. P&L summary with ROLLUP
SELECT 
    COALESCE(YEAR(order_date), 'TOTAL') AS year,
    COALESCE(QUARTER(order_date), 'Year Total') AS quarter,
    COALESCE(MONTH(order_date), 'Quarter Total') AS month,
    COUNT(*) AS transactions,
    SUM(revenue) AS gross_revenue,
    SUM(cost) AS total_cost,
    SUM(revenue - cost) AS gross_profit,
    ROUND(SUM(revenue - cost) / SUM(revenue) * 100, 2) AS margin_percent
FROM (
    SELECT 
        o.order_date,
        oi.quantity * oi.unit_price AS revenue,
        oi.quantity * p.cost AS cost
    FROM orders o
    JOIN order_items oi ON o.order_id = oi.order_id
    JOIN products p ON oi.product_id = p.product_id
    WHERE o.status = 'completed'
) AS order_details
GROUP BY YEAR(order_date), QUARTER(order_date), MONTH(order_date) WITH ROLLUP;

-- 7. Seasonal patterns
SELECT 
    MONTH(order_date) AS month_num,
    MONTHNAME(order_date) AS month_name,
    COUNT(*) AS total_orders,
    AVG(amount) AS avg_order_value,
    SUM(amount) AS total_revenue,
    ROUND(
        SUM(amount) * 100.0 / SUM(SUM(amount)) OVER (), 
        2
    ) AS percent_of_annual
FROM orders
WHERE status = 'completed'
GROUP BY MONTH(order_date), MONTHNAME(order_date)
ORDER BY month_num;

-- 8. Customer segmentation
SELECT 
    CASE 
        WHEN rfm_score >= 9 THEN 'Champions'
        WHEN rfm_score >= 7 THEN 'Loyal Customers'
        WHEN rfm_score >= 5 THEN 'Potential Loyalists'
        WHEN rfm_score >= 3 THEN 'At Risk'
        ELSE 'Lost'
    END AS segment,
    COUNT(*) AS customers,
    AVG(total_orders) AS avg_orders,
    AVG(total_spent) AS avg_revenue,
    AVG(days_since_last) AS avg_recency
FROM (
    SELECT 
        customer_id,
        COUNT(*) AS total_orders,
        SUM(amount) AS total_spent,
        DATEDIFF(CURDATE(), MAX(order_date)) AS days_since_last,
        NTILE(3) OVER (ORDER BY DATEDIFF(CURDATE(), MAX(order_date))) AS recency_score,
        NTILE(3) OVER (ORDER BY COUNT(*)) AS frequency_score,
        NTILE(3) OVER (ORDER BY SUM(amount)) AS monetary_score,
        NTILE(3) OVER (ORDER BY DATEDIFF(CURDATE(), MAX(order_date))) +
        NTILE(3) OVER (ORDER BY COUNT(*)) +
        NTILE(3) OVER (ORDER BY SUM(amount)) AS rfm_score
    FROM orders
    WHERE status = 'completed'
    GROUP BY customer_id
) AS customer_rfm
GROUP BY segment
ORDER BY rfm_score DESC;

Additional Resources