GROUP BY & HAVING - Data Grouping Mastery
Learning Objectives
- Master GROUP BY fundamentals and execution order
- Group by single and multiple columns
- Understand the difference between WHERE and HAVING
- Use GROUP BY with aggregate functions
- Implement ROLLUP and CUBE for subtotals
- Apply GROUPING SETS for flexible grouping
- Handle NULL values in grouping
- Optimize GROUP BY performance
Understanding GROUP BY
GROUP BY transforms your data from individual rows into meaningful groups, enabling powerful data analysis and reporting! 📊
GROUP BY Golden Rules
Basic GROUP BY Operations
graph TB
GROUP[GROUP BY] --> SINGLE[Single Column]
GROUP --> MULTI[Multiple Columns]
GROUP --> EXPR[Expressions]
GROUP --> POS[Position]
SINGLE --> S1[Simple grouping]
MULTI --> M1[Hierarchical groups]
EXPR --> E1[Calculated groups]
POS --> P1[GROUP BY 1, 2]
style GROUP fill:#dcfce7
style SINGLE fill:#dbeafe
style MULTI fill:#fef3c7
style EXPR fill:#fee2e2
GROUP BY Fundamentals
-- ========================================
-- SINGLE COLUMN GROUPING
-- ========================================
-- Basic GROUP BY with COUNT
SELECT
category,
COUNT(*) AS product_count
FROM products
GROUP BY category;
-- GROUP BY with multiple aggregates
SELECT
customer_id,
COUNT(*) AS order_count,
SUM(amount) AS total_spent,
AVG(amount) AS avg_order_value,
MIN(order_date) AS first_order,
MAX(order_date) AS last_order
FROM orders
GROUP BY customer_id;
-- GROUP BY with JOIN
SELECT
c.country,
COUNT(DISTINCT c.customer_id) AS customers,
COUNT(o.order_id) AS orders,
SUM(o.amount) AS revenue
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.country
ORDER BY revenue DESC;
-- ========================================
-- MULTIPLE COLUMN GROUPING
-- ========================================
-- Hierarchical grouping
SELECT
YEAR(order_date) AS year,
MONTH(order_date) AS month,
category,
COUNT(*) AS sales_count,
SUM(amount) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY
YEAR(order_date),
MONTH(order_date),
category
ORDER BY year DESC, month DESC, revenue DESC;
-- Multi-level aggregation
SELECT
country,
state,
city,
COUNT(*) AS customer_count,
AVG(lifetime_value) AS avg_ltv
FROM customers
GROUP BY country, state, city
WITH ROLLUP; -- Adds subtotals
-- ========================================
-- GROUP BY EXPRESSIONS
-- ========================================
-- Group by calculated values
SELECT
CASE
WHEN amount < 100 THEN 'Small'
WHEN amount < 500 THEN 'Medium'
WHEN amount < 1000 THEN 'Large'
ELSE 'Enterprise'
END AS order_size,
COUNT(*) AS order_count,
AVG(amount) AS avg_amount
FROM orders
GROUP BY
CASE
WHEN amount < 100 THEN 'Small'
WHEN amount < 500 THEN 'Medium'
WHEN amount < 1000 THEN 'Large'
ELSE 'Enterprise'
END;
-- Group by date parts
SELECT
DATE_FORMAT(order_date, '%Y-%m') AS month,
DAYOFWEEK(order_date) AS day_of_week,
CASE DAYOFWEEK(order_date)
WHEN 1 THEN 'Sunday'
WHEN 2 THEN 'Monday'
WHEN 3 THEN 'Tuesday'
WHEN 4 THEN 'Wednesday'
WHEN 5 THEN 'Thursday'
WHEN 6 THEN 'Friday'
WHEN 7 THEN 'Saturday'
END AS day_name,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders
GROUP BY
DATE_FORMAT(order_date, '%Y-%m'),
DAYOFWEEK(order_date)
ORDER BY month, day_of_week;
-- ========================================
-- GROUP BY WITH NULL HANDLING
-- ========================================
-- NULLs form their own group
SELECT
COALESCE(category, 'Uncategorized') AS category_name,
COUNT(*) AS product_count,
AVG(price) AS avg_price
FROM products
GROUP BY category; -- NULL category becomes a group
-- Exclude NULL groups
SELECT
category,
COUNT(*) AS count
FROM products
WHERE category IS NOT NULL
GROUP BY category;
-- ========================================
-- GROUP BY POSITION (Less Readable)
-- ========================================
-- Group by column position
SELECT
category,
brand,
COUNT(*) AS count
FROM products
GROUP BY 1, 2; -- Groups by first and second column
-- Better to use column names for clarity
SELECT
category,
brand,
COUNT(*) AS count
FROM products
GROUP BY category, brand;
HAVING Clause - Filtering Groups
HAVING Clause Examples
-- ========================================
-- BASIC HAVING EXAMPLES
-- ========================================
-- Filter groups by count
SELECT
category,
COUNT(*) AS product_count
FROM products
GROUP BY category
HAVING COUNT(*) > 10;
-- Multiple conditions in HAVING
SELECT
customer_id,
COUNT(*) AS order_count,
SUM(amount) AS total_spent,
AVG(amount) AS avg_order
FROM orders
GROUP BY customer_id
HAVING
COUNT(*) >= 5
AND SUM(amount) > 1000
AND AVG(amount) > 100;
-- ========================================
-- WHERE AND HAVING TOGETHER
-- ========================================
-- Best practice: Filter early with WHERE
SELECT
c.country,
COUNT(DISTINCT c.customer_id) AS customers,
SUM(o.amount) AS revenue
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
WHERE
o.status = 'completed' -- WHERE: filter individual orders
AND o.order_date >= '2024-01-01'
GROUP BY c.country
HAVING
COUNT(DISTINCT c.customer_id) > 100 -- HAVING: filter country groups
AND SUM(o.amount) > 10000
ORDER BY revenue DESC;
-- ========================================
-- HAVING WITH AGGREGATE COMPARISONS
-- ========================================
-- Groups above average
SELECT
category,
AVG(price) AS category_avg_price
FROM products
GROUP BY category
HAVING AVG(price) > (
SELECT AVG(price) FROM products
);
-- Relative comparisons
SELECT
salesperson_id,
SUM(sale_amount) AS total_sales
FROM sales
WHERE YEAR(sale_date) = 2024
GROUP BY salesperson_id
HAVING SUM(sale_amount) > (
SELECT AVG(total_sales) * 1.2 -- 20% above average
FROM (
SELECT SUM(sale_amount) AS total_sales
FROM sales
WHERE YEAR(sale_date) = 2024
GROUP BY salesperson_id
) AS avg_calc
);
-- ========================================
-- HAVING WITHOUT GROUP BY
-- ========================================
-- HAVING can be used without GROUP BY (treats all rows as one group)
SELECT
COUNT(*) AS total_products,
AVG(price) AS avg_price
FROM products
HAVING AVG(price) > 50; -- Filters the single group
-- ========================================
-- COMPLEX HAVING PATTERNS
-- ========================================
-- Find duplicate records
SELECT
email,
COUNT(*) AS duplicate_count
FROM customers
GROUP BY email
HAVING COUNT(*) > 1;
-- Categories with diverse price ranges
SELECT
category,
COUNT(*) AS products,
MIN(price) AS min_price,
MAX(price) AS max_price,
MAX(price) - MIN(price) AS price_range
FROM products
GROUP BY category
HAVING
MAX(price) - MIN(price) > 500
AND COUNT(*) > 5
ORDER BY price_range DESC;
-- Customers with irregular ordering patterns
SELECT
customer_id,
COUNT(*) AS order_count,
STD(amount) AS order_std_dev,
AVG(amount) AS avg_amount
FROM orders
GROUP BY customer_id
HAVING
STD(amount) > AVG(amount) * 0.5 -- High variation
AND COUNT(*) > 10;
-- Time-based group filtering
SELECT
DATE_FORMAT(order_date, '%Y-%m') AS month,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
HAVING
COUNT(*) > (
SELECT AVG(monthly_orders)
FROM (
SELECT COUNT(*) AS monthly_orders
FROM orders
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
) AS monthly_avg
);
Advanced Grouping: ROLLUP, CUBE, and GROUPING SETS
MySQL Support Note
Advanced Grouping Techniques
-- ========================================
-- ROLLUP - Hierarchical Subtotals
-- ========================================
-- Simple ROLLUP
SELECT
category,
COUNT(*) AS products,
SUM(stock * price) AS inventory_value
FROM products
GROUP BY category WITH ROLLUP;
-- Multi-level ROLLUP
SELECT
COALESCE(YEAR(order_date), 'Grand Total') AS year,
COALESCE(MONTH(order_date), 'Year Total') AS month,
COALESCE(category, 'Month Total') AS category,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.status = 'completed'
GROUP BY
YEAR(order_date),
MONTH(order_date),
category
WITH ROLLUP;
-- ROLLUP with GROUPING() function
SELECT
category,
brand,
COUNT(*) AS count,
SUM(price) AS total_price,
CASE
WHEN GROUPING(category) = 1 THEN 'GRAND TOTAL'
WHEN GROUPING(brand) = 1 THEN 'Category Subtotal'
ELSE 'Detail'
END AS level_type
FROM products
GROUP BY category, brand WITH ROLLUP;
-- ========================================
-- CUBE Alternative (MySQL doesn't support CUBE)
-- ========================================
-- Simulate CUBE with UNION
-- CUBE would give all possible grouping combinations
-- All combinations for (A, B)
-- 1. GROUP BY A, B
-- 2. GROUP BY A
-- 3. GROUP BY B
-- 4. Grand total
(SELECT
region,
product_category,
SUM(sales_amount) AS total_sales,
'Region-Category' AS grouping_level
FROM sales
GROUP BY region, product_category)
UNION ALL
(SELECT
region,
NULL AS product_category,
SUM(sales_amount) AS total_sales,
'Region Only' AS grouping_level
FROM sales
GROUP BY region)
UNION ALL
(SELECT
NULL AS region,
product_category,
SUM(sales_amount) AS total_sales,
'Category Only' AS grouping_level
FROM sales
GROUP BY product_category)
UNION ALL
(SELECT
NULL AS region,
NULL AS product_category,
SUM(sales_amount) AS total_sales,
'Grand Total' AS grouping_level
FROM sales)
ORDER BY
COALESCE(region, 'ZZZZZ'),
COALESCE(product_category, 'ZZZZZ');
-- ========================================
-- GROUPING SETS Alternative (MySQL workaround)
-- ========================================
-- Simulate GROUPING SETS with UNION
-- Different grouping combinations as needed
(SELECT
'By Year-Month' AS group_type,
YEAR(order_date) AS year,
MONTH(order_date) AS month,
NULL AS category,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders
GROUP BY YEAR(order_date), MONTH(order_date))
UNION ALL
(SELECT
'By Category' AS group_type,
NULL AS year,
NULL AS month,
category,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY category)
UNION ALL
(SELECT
'Grand Total' AS group_type,
NULL AS year,
NULL AS month,
NULL AS category,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders)
ORDER BY group_type, year, month, category;
-- ========================================
-- Practical ROLLUP Applications
-- ========================================
-- Sales hierarchy report
SELECT
COALESCE(region, 'ALL REGIONS') AS region,
COALESCE(country, 'All Countries') AS country,
COALESCE(state, 'All States') AS state,
COUNT(DISTINCT customer_id) AS customers,
COUNT(order_id) AS orders,
FORMAT(SUM(amount), 2) AS revenue,
FORMAT(AVG(amount), 2) AS avg_order_value
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.status = 'completed'
GROUP BY region, country, state WITH ROLLUP
HAVING region IS NOT NULL OR (region IS NULL AND country IS NULL AND state IS NULL);
-- Inventory summary with subtotals
SELECT
COALESCE(warehouse, 'ALL WAREHOUSES') AS warehouse,
COALESCE(category, 'All Categories') AS category,
COUNT(DISTINCT product_id) AS products,
SUM(quantity_on_hand) AS total_units,
FORMAT(SUM(quantity_on_hand * unit_cost), 2) AS inventory_value,
CASE
WHEN GROUPING(warehouse) = 1 THEN '*** GRAND TOTAL ***'
WHEN GROUPING(category) = 1 THEN '** Warehouse Total **'
ELSE ''
END AS summary_level
FROM inventory
GROUP BY warehouse, category WITH ROLLUP;
GROUP BY Performance Optimization
Optimizing GROUP BY Queries
-- ========================================
-- INDEX OPTIMIZATION FOR GROUP BY
-- ========================================
-- Create index for GROUP BY columns
CREATE INDEX idx_groupby ON orders(customer_id, status, order_date);
-- Query uses index for grouping
EXPLAIN SELECT
customer_id,
status,
COUNT(*),
SUM(amount)
FROM orders
GROUP BY customer_id, status;
-- Covering index includes all needed columns
CREATE INDEX idx_covering ON orders(
customer_id,
status,
amount
);
-- ========================================
-- OPTIMIZE GROUP BY ORDER
-- ========================================
-- Match GROUP BY order to index order
-- Good: Matches index (customer_id, status, order_date)
SELECT customer_id, status, DATE(order_date), COUNT(*)
FROM orders
GROUP BY customer_id, status, DATE(order_date);
-- Bad: Different order than index
SELECT DATE(order_date), customer_id, status, COUNT(*)
FROM orders
GROUP BY DATE(order_date), customer_id, status;
-- ========================================
-- REDUCE GROUPED DATA VOLUME
-- ========================================
-- Filter before grouping (use WHERE not HAVING when possible)
-- Efficient:
SELECT category, COUNT(*)
FROM products
WHERE price > 100 -- Filters first
GROUP BY category;
-- Inefficient:
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING AVG(price) > 100; -- Filters after grouping
-- ========================================
-- SUMMARY TABLES FOR PERFORMANCE
-- ========================================
-- Create materialized summary table
CREATE TABLE daily_sales_summary (
sale_date DATE PRIMARY KEY,
total_orders INT,
total_revenue DECIMAL(10,2),
unique_customers INT,
INDEX idx_date (sale_date)
);
-- Populate summary table (run periodically)
INSERT INTO daily_sales_summary
SELECT
DATE(order_date),
COUNT(*),
SUM(amount),
COUNT(DISTINCT customer_id)
FROM orders
WHERE DATE(order_date) = CURDATE() - INTERVAL 1 DAY
GROUP BY DATE(order_date)
ON DUPLICATE KEY UPDATE
total_orders = VALUES(total_orders),
total_revenue = VALUES(total_revenue),
unique_customers = VALUES(unique_customers);
-- ========================================
-- PARTITION OPTIMIZATION
-- ========================================
-- Partition large tables for better GROUP BY performance
ALTER TABLE orders
PARTITION BY RANGE (YEAR(order_date)) (
PARTITION p2022 VALUES LESS THAN (2023),
PARTITION p2023 VALUES LESS THAN (2024),
PARTITION p2024 VALUES LESS THAN (2025),
PARTITION p_future VALUES LESS THAN MAXVALUE
);
-- ========================================
-- MONITORING GROUP BY PERFORMANCE
-- ========================================
-- Check temporary table usage
SHOW STATUS LIKE 'Created_tmp%';
-- Analyze slow GROUP BY queries
EXPLAIN FORMAT=JSON
SELECT
customer_id,
YEAR(order_date) AS year,
COUNT(*) AS orders,
SUM(amount) AS revenue
FROM orders
GROUP BY customer_id, YEAR(order_date)
HAVING orders > 10;
Practice Exercise
GROUP BY Challenges
💡 Click for Solutions
-- 1. Top 3 customers by revenue in each country
WITH customer_rankings AS (
SELECT
c.country,
c.customer_id,
c.customer_name,
SUM(o.amount) AS total_revenue,
ROW_NUMBER() OVER (
PARTITION BY c.country
ORDER BY SUM(o.amount) DESC
) AS rank_in_country
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
WHERE o.status = 'completed'
GROUP BY c.country, c.customer_id
)
SELECT *
FROM customer_rankings
WHERE rank_in_country <= 3
ORDER BY country, rank_in_country;
-- 2. Year-over-year growth by category
WITH yearly_sales AS (
SELECT
p.category,
YEAR(o.order_date) AS year,
SUM(oi.quantity * oi.unit_price) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.status = 'completed'
GROUP BY p.category, YEAR(o.order_date)
)
SELECT
category,
year,
revenue,
LAG(revenue) OVER (PARTITION BY category ORDER BY year) AS prev_year_revenue,
ROUND(
(revenue - LAG(revenue) OVER (PARTITION BY category ORDER BY year)) /
LAG(revenue) OVER (PARTITION BY category ORDER BY year) * 100,
2
) AS yoy_growth_percent
FROM yearly_sales
ORDER BY category, year;
-- 3. Products that sell better on weekends
SELECT
p.product_id,
p.product_name,
SUM(CASE WHEN DAYOFWEEK(o.order_date) IN (1, 7) THEN oi.quantity ELSE 0 END) AS weekend_sales,
SUM(CASE WHEN DAYOFWEEK(o.order_date) NOT IN (1, 7) THEN oi.quantity ELSE 0 END) AS weekday_sales,
ROUND(
SUM(CASE WHEN DAYOFWEEK(o.order_date) IN (1, 7) THEN oi.quantity ELSE 0 END) * 100.0 /
SUM(oi.quantity), 2
) AS weekend_percentage
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
GROUP BY p.product_id
HAVING weekend_percentage > 40 -- More than 40% of sales on weekends
ORDER BY weekend_percentage DESC;
-- 4. Cohort analysis by signup month
SELECT
DATE_FORMAT(c.created_at, '%Y-%m') AS cohort_month,
DATE_FORMAT(o.order_date, '%Y-%m') AS order_month,
PERIOD_DIFF(
DATE_FORMAT(o.order_date, '%Y%m'),
DATE_FORMAT(c.created_at, '%Y%m')
) AS months_since_signup,
COUNT(DISTINCT c.customer_id) AS customers,
COUNT(o.order_id) AS orders,
SUM(o.amount) AS revenue
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY cohort_month, order_month
ORDER BY cohort_month, months_since_signup;
-- 5. Categories where all products are above average
SELECT
category,
COUNT(*) AS product_count,
MIN(price) AS min_price,
AVG(price) AS avg_price
FROM products
GROUP BY category
HAVING MIN(price) > (SELECT AVG(price) FROM products);
-- 6. P&L summary with ROLLUP
SELECT
COALESCE(YEAR(order_date), 'TOTAL') AS year,
COALESCE(QUARTER(order_date), 'Year Total') AS quarter,
COALESCE(MONTH(order_date), 'Quarter Total') AS month,
COUNT(*) AS transactions,
SUM(revenue) AS gross_revenue,
SUM(cost) AS total_cost,
SUM(revenue - cost) AS gross_profit,
ROUND(SUM(revenue - cost) / SUM(revenue) * 100, 2) AS margin_percent
FROM (
SELECT
o.order_date,
oi.quantity * oi.unit_price AS revenue,
oi.quantity * p.cost AS cost
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.status = 'completed'
) AS order_details
GROUP BY YEAR(order_date), QUARTER(order_date), MONTH(order_date) WITH ROLLUP;
-- 7. Seasonal patterns
SELECT
MONTH(order_date) AS month_num,
MONTHNAME(order_date) AS month_name,
COUNT(*) AS total_orders,
AVG(amount) AS avg_order_value,
SUM(amount) AS total_revenue,
ROUND(
SUM(amount) * 100.0 / SUM(SUM(amount)) OVER (),
2
) AS percent_of_annual
FROM orders
WHERE status = 'completed'
GROUP BY MONTH(order_date), MONTHNAME(order_date)
ORDER BY month_num;
-- 8. Customer segmentation
SELECT
CASE
WHEN rfm_score >= 9 THEN 'Champions'
WHEN rfm_score >= 7 THEN 'Loyal Customers'
WHEN rfm_score >= 5 THEN 'Potential Loyalists'
WHEN rfm_score >= 3 THEN 'At Risk'
ELSE 'Lost'
END AS segment,
COUNT(*) AS customers,
AVG(total_orders) AS avg_orders,
AVG(total_spent) AS avg_revenue,
AVG(days_since_last) AS avg_recency
FROM (
SELECT
customer_id,
COUNT(*) AS total_orders,
SUM(amount) AS total_spent,
DATEDIFF(CURDATE(), MAX(order_date)) AS days_since_last,
NTILE(3) OVER (ORDER BY DATEDIFF(CURDATE(), MAX(order_date))) AS recency_score,
NTILE(3) OVER (ORDER BY COUNT(*)) AS frequency_score,
NTILE(3) OVER (ORDER BY SUM(amount)) AS monetary_score,
NTILE(3) OVER (ORDER BY DATEDIFF(CURDATE(), MAX(order_date))) +
NTILE(3) OVER (ORDER BY COUNT(*)) +
NTILE(3) OVER (ORDER BY SUM(amount)) AS rfm_score
FROM orders
WHERE status = 'completed'
GROUP BY customer_id
) AS customer_rfm
GROUP BY segment
ORDER BY rfm_score DESC;