HAVING Clause Deep Dive
Learning Objectives
- Master HAVING clause mechanics and execution timing
- Understand HAVING vs WHERE in depth
- Use complex aggregate conditions in HAVING
- Implement subqueries within HAVING
- Apply HAVING with window functions
- Optimize HAVING clause performance
- Handle edge cases and common pitfalls
- Build complex analytical queries with HAVING
HAVING Clause Mechanics
HAVING is SQL's post-grouping filter, allowing you to filter aggregated results after GROUP BY has created groups! 🎯
HAVING Key Concepts
HAVING vs WHERE: Deep Comparison
graph TB
FILTER[Filtering Data] --> WHERE[WHERE Clause]
FILTER --> HAVING[HAVING Clause]
WHERE --> W1[Pre-aggregation]
WHERE --> W2[Row-level]
WHERE --> W3[No aggregates]
WHERE --> W4[More efficient]
HAVING --> H1[Post-aggregation]
HAVING --> H2[Group-level]
HAVING --> H3[Uses aggregates]
HAVING --> H4[Processes less data]
style FILTER fill:#dcfce7
style WHERE fill:#dbeafe
style HAVING fill:#fef3c7
WHERE vs HAVING Examples
-- ========================================
-- COMPARING WHERE AND HAVING
-- ========================================
-- Wrong: Cannot use aggregate in WHERE
-- This will cause an error:
-- SELECT category, COUNT(*)
-- FROM products
-- WHERE COUNT(*) > 10 -- ERROR!
-- GROUP BY category;
-- Correct: Use HAVING for aggregate conditions
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING COUNT(*) > 10;
-- ========================================
-- OPTIMAL FILTERING STRATEGY
-- ========================================
-- Inefficient: Filtering after grouping
SELECT
customer_id,
COUNT(*) as order_count,
SUM(amount) as total_spent
FROM orders
GROUP BY customer_id
HAVING customer_id IN (
SELECT customer_id
FROM customers
WHERE country = 'USA'
);
-- Efficient: Filter before grouping
SELECT
o.customer_id,
COUNT(*) as order_count,
SUM(o.amount) as total_spent
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE c.country = 'USA' -- Filter early!
GROUP BY o.customer_id;
-- ========================================
-- COMBINED WHERE AND HAVING
-- ========================================
-- Complex filtering at both levels
SELECT
DATE_FORMAT(o.order_date, '%Y-%m') as month,
c.customer_segment,
COUNT(DISTINCT o.customer_id) as unique_customers,
COUNT(o.order_id) as total_orders,
SUM(o.amount) as revenue,
AVG(o.amount) as avg_order_value
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE
-- Row-level filters (WHERE)
o.status = 'completed'
AND o.order_date >= '2024-01-01'
AND c.is_active = TRUE
GROUP BY
DATE_FORMAT(o.order_date, '%Y-%m'),
c.customer_segment
HAVING
-- Group-level filters (HAVING)
COUNT(DISTINCT o.customer_id) >= 10 -- At least 10 customers
AND SUM(o.amount) > 5000 -- Revenue threshold
AND AVG(o.amount) > 100 -- Average order threshold
ORDER BY month DESC, revenue DESC;
-- ========================================
-- EDGE CASE: Column in both WHERE and HAVING
-- ========================================
-- You can reference grouped columns in HAVING
SELECT
category,
brand,
COUNT(*) as product_count,
AVG(price) as avg_price
FROM products
WHERE
category IN ('Electronics', 'Computers') -- Pre-filter categories
GROUP BY category, brand
HAVING
category = 'Electronics' -- Valid! category is in GROUP BY
AND COUNT(*) > 5
AND AVG(price) > 500;
Complex HAVING Conditions
Advanced HAVING Patterns
-- ========================================
-- HAVING WITH COMPLEX AGGREGATES
-- ========================================
-- Multiple aggregate conditions with calculations
SELECT
product_category,
supplier_id,
COUNT(*) as product_count,
AVG(price) as avg_price,
MIN(price) as min_price,
MAX(price) as max_price,
MAX(price) - MIN(price) as price_range,
STD(price) as price_std_dev
FROM products
GROUP BY product_category, supplier_id
HAVING
COUNT(*) >= 5 -- Minimum products
AND MAX(price) - MIN(price) > AVG(price) * 0.5 -- Significant range
AND STD(price) > 10 -- Price variation
AND AVG(price) BETWEEN 50 AND 500; -- Price band
-- ========================================
-- HAVING WITH CONDITIONAL AGGREGATES
-- ========================================
-- Using CASE within aggregates in HAVING
SELECT
customer_id,
COUNT(*) as total_orders,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed_orders,
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) as cancelled_orders,
SUM(amount) as total_amount
FROM orders
GROUP BY customer_id
HAVING
-- More than 20% cancellation rate
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) / COUNT(*) > 0.2
-- And significant order volume
AND COUNT(*) >= 10
-- But still profitable
AND SUM(CASE WHEN status = 'completed' THEN amount ELSE 0 END) > 1000;
-- ========================================
-- HAVING WITH SUBQUERIES
-- ========================================
-- Compare groups to overall statistics
SELECT
department,
COUNT(*) as employee_count,
AVG(salary) as avg_salary,
SUM(salary) as total_payroll
FROM employees
GROUP BY department
HAVING
-- Department average above company average
AVG(salary) > (SELECT AVG(salary) FROM employees)
-- And represents significant portion of payroll
AND SUM(salary) > (SELECT SUM(salary) * 0.1 FROM employees);
-- Dynamic thresholds with subqueries
SELECT
product_id,
COUNT(*) as review_count,
AVG(rating) as avg_rating
FROM reviews
GROUP BY product_id
HAVING
-- Above median review count
COUNT(*) > (
SELECT AVG(review_count)
FROM (
SELECT COUNT(*) as review_count
FROM reviews
GROUP BY product_id
) as counts
)
-- And rating in top quartile
AND AVG(rating) > (
SELECT DISTINCT rating
FROM reviews
ORDER BY rating DESC
LIMIT 1 OFFSET (SELECT COUNT(DISTINCT rating) * 0.25 FROM reviews)
);
-- ========================================
-- HAVING WITH CORRELATED SUBQUERIES
-- ========================================
-- Find categories performing better than their historical average
SELECT
category,
YEAR(order_date) as year,
MONTH(order_date) as month,
SUM(amount) as monthly_revenue
FROM orders o
JOIN products p ON o.product_id = p.product_id
GROUP BY category, YEAR(order_date), MONTH(order_date)
HAVING
SUM(amount) > (
-- Historical average for this category
SELECT AVG(monthly_rev)
FROM (
SELECT SUM(amount) as monthly_rev
FROM orders o2
JOIN products p2 ON o2.product_id = p2.product_id
WHERE p2.category = p.category
AND o2.order_date < DATE_SUB(LAST_DAY(CONCAT(year, '-', month, '-01')), INTERVAL 0 DAY)
GROUP BY YEAR(o2.order_date), MONTH(o2.order_date)
) as historical
) * 1.1; -- 10% above historical average
HAVING Performance Optimization
Performance Considerations
Optimizing HAVING Queries
-- ========================================
-- OPTIMIZATION STRATEGIES
-- ========================================
-- Strategy 1: Filter early with WHERE
-- Bad: HAVING does unnecessary grouping
SELECT
customer_id,
COUNT(*),
SUM(amount)
FROM orders
GROUP BY customer_id
HAVING customer_id IN (1, 2, 3, 4, 5); -- Should be WHERE
-- Good: WHERE filters before grouping
SELECT
customer_id,
COUNT(*),
SUM(amount)
FROM orders
WHERE customer_id IN (1, 2, 3, 4, 5) -- Filter first
GROUP BY customer_id;
-- ========================================
-- Strategy 2: Use indexes for GROUP BY
-- ========================================
-- Create covering index for GROUP BY + aggregates
CREATE INDEX idx_orders_customer_amount
ON orders(customer_id, status, amount);
-- This query can use index only
SELECT
customer_id,
COUNT(*) as order_count,
SUM(amount) as total
FROM orders
WHERE status = 'completed'
GROUP BY customer_id
HAVING COUNT(*) > 10;
-- ========================================
-- Strategy 3: Materialized aggregates
-- ========================================
-- Create summary table for frequent HAVING queries
CREATE TABLE customer_summary (
customer_id INT PRIMARY KEY,
total_orders INT,
total_amount DECIMAL(10,2),
avg_order_value DECIMAL(10,2),
last_order_date DATE,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
INDEX idx_metrics (total_orders, total_amount)
);
-- Populate summary (run periodically)
INSERT INTO customer_summary
SELECT
customer_id,
COUNT(*),
SUM(amount),
AVG(amount),
MAX(order_date),
NOW()
FROM orders
WHERE status = 'completed'
GROUP BY customer_id
ON DUPLICATE KEY UPDATE
total_orders = VALUES(total_orders),
total_amount = VALUES(total_amount),
avg_order_value = VALUES(avg_order_value),
last_order_date = VALUES(last_order_date);
-- Now HAVING queries are instant
SELECT *
FROM customer_summary
WHERE total_orders > 10 -- No HAVING needed!
AND total_amount > 1000;
-- ========================================
-- Strategy 4: Optimize complex HAVING
-- ========================================
-- Inefficient: Multiple passes for statistics
SELECT
category,
COUNT(*) as cnt,
AVG(price) as avg_price,
STD(price) as std_price
FROM products
GROUP BY category
HAVING
STD(price) / AVG(price) > 0.5 -- Coefficient of variation
AND COUNT(*) > 10;
-- Efficient: Calculate once, filter once
SELECT *
FROM (
SELECT
category,
COUNT(*) as cnt,
AVG(price) as avg_price,
STD(price) as std_price,
STD(price) / AVG(price) as coeff_variation
FROM products
GROUP BY category
) as stats
WHERE coeff_variation > 0.5
AND cnt > 10;
-- ========================================
-- Strategy 5: Partition elimination
-- ========================================
-- If table is partitioned by date
ALTER TABLE orders
PARTITION BY RANGE (YEAR(order_date)) (
PARTITION p2023 VALUES LESS THAN (2024),
PARTITION p2024 VALUES LESS THAN (2025),
PARTITION p2025 VALUES LESS THAN (2026)
);
-- This query only scans relevant partitions
SELECT
customer_id,
COUNT(*) as orders_2024
FROM orders
WHERE order_date >= '2024-01-01'
AND order_date < '2025-01-01'
GROUP BY customer_id
HAVING COUNT(*) > 50;
Common HAVING Pitfalls and Solutions
Avoiding HAVING Mistakes
-- ========================================
-- PITFALL 1: Column Aliases in HAVING
-- ========================================
-- MySQL allows this (non-standard):
SELECT
customer_id,
COUNT(*) as order_count
FROM orders
GROUP BY customer_id
HAVING order_count > 10; -- Works in MySQL
-- Standard SQL requires:
SELECT
customer_id,
COUNT(*) as order_count
FROM orders
GROUP BY customer_id
HAVING COUNT(*) > 10; -- Portable
-- ========================================
-- PITFALL 2: NULL Handling in HAVING
-- ========================================
-- NULLs can cause unexpected results
SELECT
category,
AVG(discount) as avg_discount
FROM products
GROUP BY category
HAVING AVG(discount) > 10; -- Excludes NULL discount groups
-- Handle NULLs explicitly
SELECT
category,
AVG(COALESCE(discount, 0)) as avg_discount,
COUNT(discount) as products_with_discount
FROM products
GROUP BY category
HAVING AVG(COALESCE(discount, 0)) > 10
OR COUNT(discount) = 0; -- Include all-NULL groups
-- ========================================
-- PITFALL 3: HAVING Without GROUP BY
-- ========================================
-- Valid but treats entire result as one group
SELECT
COUNT(*) as total,
AVG(price) as avg_price
FROM products
HAVING AVG(price) > 100; -- Filters the single group
-- ========================================
-- PITFALL 4: Mixing WHERE and HAVING Logic
-- ========================================
-- Confusing: Mixes row and group logic
SELECT
category,
COUNT(*) as product_count
FROM products
GROUP BY category
HAVING COUNT(*) > 5
AND category != 'Discontinued'; -- Should be WHERE
-- Clear: Separate row and group filters
SELECT
category,
COUNT(*) as product_count
FROM products
WHERE category != 'Discontinued' -- Row filter
GROUP BY category
HAVING COUNT(*) > 5; -- Group filter
-- ========================================
-- PITFALL 5: Incorrect Aggregate Scope
-- ========================================
-- Wrong: Trying to filter individual rows in HAVING
SELECT
customer_id,
order_id, -- Not in GROUP BY!
SUM(amount)
FROM orders
GROUP BY customer_id
HAVING amount > 100; -- Error: amount not aggregated
-- Correct: Aggregate or include in GROUP BY
SELECT
customer_id,
COUNT(*) as order_count,
SUM(amount) as total
FROM orders
WHERE amount > 100 -- Filter individual orders
GROUP BY customer_id
HAVING SUM(amount) > 1000; -- Filter grouped totals
Practice Exercise
Advanced HAVING Challenges
💡 Click for Solutions
-- 1. Customer segments where average exceeds median
WITH segment_stats AS (
SELECT
customer_segment,
AVG(order_value) as avg_value,
COUNT(*) as order_count
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
GROUP BY customer_segment
),
median_calc AS (
SELECT
customer_segment,
order_value,
ROW_NUMBER() OVER (PARTITION BY customer_segment ORDER BY order_value) as rn,
COUNT(*) OVER (PARTITION BY customer_segment) as cnt
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
)
SELECT
s.customer_segment,
s.avg_value,
m.median_value,
s.order_count
FROM segment_stats s
JOIN (
SELECT
customer_segment,
AVG(order_value) as median_value
FROM median_calc
WHERE rn IN (FLOOR((cnt + 1) / 2), CEIL((cnt + 1) / 2))
GROUP BY customer_segment
) m ON s.customer_segment = m.customer_segment
WHERE s.avg_value > m.median_value * 1.2;
-- 2. Products with abnormal return rates
SELECT
product_id,
product_name,
COUNT(*) as total_sales,
SUM(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as returns,
AVG(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as return_rate,
STD(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as return_std
FROM order_items oi
JOIN products p ON oi.product_id = p.product_id
GROUP BY product_id
HAVING
COUNT(*) >= 20 -- Minimum sample size
AND AVG(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) > (
SELECT AVG(return_rate) + 2 * STD(return_rate)
FROM (
SELECT AVG(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as return_rate
FROM order_items
GROUP BY product_id
) as rates
);
-- 3. Months exceeding both previous and year-ago
WITH monthly_sales AS (
SELECT
DATE_FORMAT(order_date, '%Y-%m') as month,
SUM(amount) as revenue
FROM orders
WHERE status = 'completed'
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
)
SELECT
month,
revenue,
LAG(revenue, 1) OVER (ORDER BY month) as prev_month,
LAG(revenue, 12) OVER (ORDER BY month) as year_ago
FROM monthly_sales
HAVING
revenue > LAG(revenue, 1) OVER (ORDER BY month) * 1.05
AND revenue > LAG(revenue, 12) OVER (ORDER BY month) * 1.10;
-- 4. High price variance but stable sales
SELECT
category,
COUNT(DISTINCT product_id) as products,
STD(price) as price_std,
AVG(price) as price_avg,
STD(monthly_sales) as sales_std,
AVG(monthly_sales) as sales_avg,
STD(price) / AVG(price) as price_cv,
STD(monthly_sales) / AVG(monthly_sales) as sales_cv
FROM (
SELECT
p.category,
p.product_id,
p.price,
DATE_FORMAT(o.order_date, '%Y-%m') as month,
SUM(oi.quantity) as monthly_sales
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
GROUP BY p.category, p.product_id, month
) as product_sales
GROUP BY category
HAVING
STD(price) / AVG(price) > 0.5 -- High price variance
AND STD(monthly_sales) / AVG(monthly_sales) < 0.2; -- Low sales variance
-- 5. Suppliers with consistent rating improvement
WITH supplier_ratings AS (
SELECT
s.supplier_id,
s.supplier_name,
QUARTER(r.review_date) as quarter,
YEAR(r.review_date) as year,
AVG(r.rating) as avg_rating,
ROW_NUMBER() OVER (PARTITION BY s.supplier_id ORDER BY YEAR(r.review_date), QUARTER(r.review_date)) as quarter_num
FROM suppliers s
JOIN products p ON s.supplier_id = p.supplier_id
JOIN reviews r ON p.product_id = r.product_id
GROUP BY s.supplier_id, year, quarter
)
SELECT
supplier_id,
supplier_name,
COUNT(*) as quarters_tracked,
MIN(avg_rating) as min_rating,
MAX(avg_rating) as max_rating
FROM supplier_ratings
GROUP BY supplier_id
HAVING
COUNT(*) >= 4 -- At least 4 quarters
AND MIN(CASE WHEN quarter_num > 1 THEN avg_rating END) >
MIN(CASE WHEN quarter_num = 1 THEN avg_rating END);
-- 6. Customers with irregular patterns
SELECT
customer_id,
COUNT(*) as order_count,
AVG(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) as avg_days_between,
STD(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) as std_days_between,
MIN(amount) as min_order,
MAX(amount) as max_order,
STD(amount) as std_amount
FROM orders
GROUP BY customer_id
HAVING
COUNT(*) >= 10
AND STD(amount) > AVG(amount) * 0.8 -- High amount variance
AND STD(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) >
AVG(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) * 0.5;
-- 7. Product combinations (association rules)
SELECT
oi1.product_id as product_a,
oi2.product_id as product_b,
COUNT(DISTINCT oi1.order_id) as orders_together,
COUNT(DISTINCT oi1.order_id) * 100.0 / (
SELECT COUNT(DISTINCT order_id)
FROM order_items
WHERE product_id = oi1.product_id
) as support_percentage
FROM order_items oi1
JOIN order_items oi2 ON oi1.order_id = oi2.order_id
AND oi1.product_id < oi2.product_id
GROUP BY oi1.product_id, oi2.product_id
HAVING
COUNT(DISTINCT oi1.order_id) >= 10
AND COUNT(DISTINCT oi1.order_id) * 100.0 / (
SELECT COUNT(DISTINCT order_id)
FROM order_items
WHERE product_id = oi1.product_id
) > 50;
-- 8. Seasonal categories using CV
SELECT
category,
COUNT(DISTINCT MONTH(order_date)) as months_active,
AVG(monthly_revenue) as avg_monthly_revenue,
STD(monthly_revenue) as std_monthly_revenue,
STD(monthly_revenue) / AVG(monthly_revenue) as coefficient_of_variation,
CASE
WHEN STD(monthly_revenue) / AVG(monthly_revenue) > 0.5 THEN 'Highly Seasonal'
WHEN STD(monthly_revenue) / AVG(monthly_revenue) > 0.3 THEN 'Moderately Seasonal'
ELSE 'Stable'
END as seasonality
FROM (
SELECT
p.category,
MONTH(o.order_date) as month,
SUM(oi.quantity * oi.unit_price) as monthly_revenue
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
WHERE o.status = 'completed'
GROUP BY p.category, MONTH(o.order_date)
) as monthly_data
GROUP BY category
HAVING STD(monthly_revenue) / AVG(monthly_revenue) > 0.3;