Skip to main content

Course Progress

Loading...

HAVING Clause Deep Dive

Duration: 75 minutes
Module 3: Advanced Topic

Learning Objectives

  • Master HAVING clause mechanics and execution timing
  • Understand HAVING vs WHERE in depth
  • Use complex aggregate conditions in HAVING
  • Implement subqueries within HAVING
  • Apply HAVING with window functions
  • Optimize HAVING clause performance
  • Handle edge cases and common pitfalls
  • Build complex analytical queries with HAVING

HAVING Clause Mechanics

HAVING is SQL's post-grouping filter, allowing you to filter aggregated results after GROUP BY has created groups! 🎯

💡
HAVING Key Concepts
  • HAVING filters AFTER grouping is complete
  • Can reference aggregate functions (COUNT, SUM, AVG, etc.)
  • Can reference columns in GROUP BY clause
  • Cannot reference ungrouped, non-aggregate columns
  • Executes before SELECT column aliases are available
  • More expensive than WHERE (processes grouped data)

HAVING vs WHERE: Deep Comparison

graph TB FILTER[Filtering Data] --> WHERE[WHERE Clause] FILTER --> HAVING[HAVING Clause] WHERE --> W1[Pre-aggregation] WHERE --> W2[Row-level] WHERE --> W3[No aggregates] WHERE --> W4[More efficient] HAVING --> H1[Post-aggregation] HAVING --> H2[Group-level] HAVING --> H3[Uses aggregates] HAVING --> H4[Processes less data] style FILTER fill:#dcfce7 style WHERE fill:#dbeafe style HAVING fill:#fef3c7

WHERE vs HAVING Examples

-- ========================================
-- COMPARING WHERE AND HAVING
-- ========================================

-- Wrong: Cannot use aggregate in WHERE
-- This will cause an error:
-- SELECT category, COUNT(*)
-- FROM products
-- WHERE COUNT(*) > 10  -- ERROR!
-- GROUP BY category;

-- Correct: Use HAVING for aggregate conditions
SELECT category, COUNT(*)
FROM products
GROUP BY category
HAVING COUNT(*) > 10;

-- ========================================
-- OPTIMAL FILTERING STRATEGY
-- ========================================

-- Inefficient: Filtering after grouping
SELECT 
    customer_id,
    COUNT(*) as order_count,
    SUM(amount) as total_spent
FROM orders
GROUP BY customer_id
HAVING customer_id IN (
    SELECT customer_id 
    FROM customers 
    WHERE country = 'USA'
);

-- Efficient: Filter before grouping
SELECT 
    o.customer_id,
    COUNT(*) as order_count,
    SUM(o.amount) as total_spent
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE c.country = 'USA'  -- Filter early!
GROUP BY o.customer_id;

-- ========================================
-- COMBINED WHERE AND HAVING
-- ========================================

-- Complex filtering at both levels
SELECT 
    DATE_FORMAT(o.order_date, '%Y-%m') as month,
    c.customer_segment,
    COUNT(DISTINCT o.customer_id) as unique_customers,
    COUNT(o.order_id) as total_orders,
    SUM(o.amount) as revenue,
    AVG(o.amount) as avg_order_value
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE 
    -- Row-level filters (WHERE)
    o.status = 'completed'
    AND o.order_date >= '2024-01-01'
    AND c.is_active = TRUE
GROUP BY 
    DATE_FORMAT(o.order_date, '%Y-%m'),
    c.customer_segment
HAVING 
    -- Group-level filters (HAVING)
    COUNT(DISTINCT o.customer_id) >= 10  -- At least 10 customers
    AND SUM(o.amount) > 5000             -- Revenue threshold
    AND AVG(o.amount) > 100              -- Average order threshold
ORDER BY month DESC, revenue DESC;

-- ========================================
-- EDGE CASE: Column in both WHERE and HAVING
-- ========================================

-- You can reference grouped columns in HAVING
SELECT 
    category,
    brand,
    COUNT(*) as product_count,
    AVG(price) as avg_price
FROM products
WHERE 
    category IN ('Electronics', 'Computers')  -- Pre-filter categories
GROUP BY category, brand
HAVING 
    category = 'Electronics'  -- Valid! category is in GROUP BY
    AND COUNT(*) > 5
    AND AVG(price) > 500;

Complex HAVING Conditions

Advanced HAVING Patterns

-- ========================================
-- HAVING WITH COMPLEX AGGREGATES
-- ========================================

-- Multiple aggregate conditions with calculations
SELECT 
    product_category,
    supplier_id,
    COUNT(*) as product_count,
    AVG(price) as avg_price,
    MIN(price) as min_price,
    MAX(price) as max_price,
    MAX(price) - MIN(price) as price_range,
    STD(price) as price_std_dev
FROM products
GROUP BY product_category, supplier_id
HAVING 
    COUNT(*) >= 5  -- Minimum products
    AND MAX(price) - MIN(price) > AVG(price) * 0.5  -- Significant range
    AND STD(price) > 10  -- Price variation
    AND AVG(price) BETWEEN 50 AND 500;  -- Price band

-- ========================================
-- HAVING WITH CONDITIONAL AGGREGATES
-- ========================================

-- Using CASE within aggregates in HAVING
SELECT 
    customer_id,
    COUNT(*) as total_orders,
    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed_orders,
    SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) as cancelled_orders,
    SUM(amount) as total_amount
FROM orders
GROUP BY customer_id
HAVING 
    -- More than 20% cancellation rate
    SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) / COUNT(*) > 0.2
    -- And significant order volume
    AND COUNT(*) >= 10
    -- But still profitable
    AND SUM(CASE WHEN status = 'completed' THEN amount ELSE 0 END) > 1000;

-- ========================================
-- HAVING WITH SUBQUERIES
-- ========================================

-- Compare groups to overall statistics
SELECT 
    department,
    COUNT(*) as employee_count,
    AVG(salary) as avg_salary,
    SUM(salary) as total_payroll
FROM employees
GROUP BY department
HAVING 
    -- Department average above company average
    AVG(salary) > (SELECT AVG(salary) FROM employees)
    -- And represents significant portion of payroll
    AND SUM(salary) > (SELECT SUM(salary) * 0.1 FROM employees);

-- Dynamic thresholds with subqueries
SELECT 
    product_id,
    COUNT(*) as review_count,
    AVG(rating) as avg_rating
FROM reviews
GROUP BY product_id
HAVING 
    -- Above median review count
    COUNT(*) > (
        SELECT AVG(review_count)
        FROM (
            SELECT COUNT(*) as review_count
            FROM reviews
            GROUP BY product_id
        ) as counts
    )
    -- And rating in top quartile
    AND AVG(rating) > (
        SELECT DISTINCT rating
        FROM reviews
        ORDER BY rating DESC
        LIMIT 1 OFFSET (SELECT COUNT(DISTINCT rating) * 0.25 FROM reviews)
    );

-- ========================================
-- HAVING WITH CORRELATED SUBQUERIES
-- ========================================

-- Find categories performing better than their historical average
SELECT 
    category,
    YEAR(order_date) as year,
    MONTH(order_date) as month,
    SUM(amount) as monthly_revenue
FROM orders o
JOIN products p ON o.product_id = p.product_id
GROUP BY category, YEAR(order_date), MONTH(order_date)
HAVING 
    SUM(amount) > (
        -- Historical average for this category
        SELECT AVG(monthly_rev)
        FROM (
            SELECT SUM(amount) as monthly_rev
            FROM orders o2
            JOIN products p2 ON o2.product_id = p2.product_id
            WHERE p2.category = p.category
            AND o2.order_date < DATE_SUB(LAST_DAY(CONCAT(year, '-', month, '-01')), INTERVAL 0 DAY)
            GROUP BY YEAR(o2.order_date), MONTH(o2.order_date)
        ) as historical
    ) * 1.1;  -- 10% above historical average

HAVING Performance Optimization

⚠️
Performance Considerations
HAVING operates on aggregated data, so optimization focuses on reducing the groups processed and efficient aggregate calculations.

Optimizing HAVING Queries

-- ========================================
-- OPTIMIZATION STRATEGIES
-- ========================================

-- Strategy 1: Filter early with WHERE
-- Bad: HAVING does unnecessary grouping
SELECT 
    customer_id,
    COUNT(*),
    SUM(amount)
FROM orders
GROUP BY customer_id
HAVING customer_id IN (1, 2, 3, 4, 5);  -- Should be WHERE

-- Good: WHERE filters before grouping
SELECT 
    customer_id,
    COUNT(*),
    SUM(amount)
FROM orders
WHERE customer_id IN (1, 2, 3, 4, 5)  -- Filter first
GROUP BY customer_id;

-- ========================================
-- Strategy 2: Use indexes for GROUP BY
-- ========================================

-- Create covering index for GROUP BY + aggregates
CREATE INDEX idx_orders_customer_amount 
ON orders(customer_id, status, amount);

-- This query can use index only
SELECT 
    customer_id,
    COUNT(*) as order_count,
    SUM(amount) as total
FROM orders
WHERE status = 'completed'
GROUP BY customer_id
HAVING COUNT(*) > 10;

-- ========================================
-- Strategy 3: Materialized aggregates
-- ========================================

-- Create summary table for frequent HAVING queries
CREATE TABLE customer_summary (
    customer_id INT PRIMARY KEY,
    total_orders INT,
    total_amount DECIMAL(10,2),
    avg_order_value DECIMAL(10,2),
    last_order_date DATE,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_metrics (total_orders, total_amount)
);

-- Populate summary (run periodically)
INSERT INTO customer_summary
SELECT 
    customer_id,
    COUNT(*),
    SUM(amount),
    AVG(amount),
    MAX(order_date),
    NOW()
FROM orders
WHERE status = 'completed'
GROUP BY customer_id
ON DUPLICATE KEY UPDATE
    total_orders = VALUES(total_orders),
    total_amount = VALUES(total_amount),
    avg_order_value = VALUES(avg_order_value),
    last_order_date = VALUES(last_order_date);

-- Now HAVING queries are instant
SELECT *
FROM customer_summary
WHERE total_orders > 10  -- No HAVING needed!
    AND total_amount > 1000;

-- ========================================
-- Strategy 4: Optimize complex HAVING
-- ========================================

-- Inefficient: Multiple passes for statistics
SELECT 
    category,
    COUNT(*) as cnt,
    AVG(price) as avg_price,
    STD(price) as std_price
FROM products
GROUP BY category
HAVING 
    STD(price) / AVG(price) > 0.5  -- Coefficient of variation
    AND COUNT(*) > 10;

-- Efficient: Calculate once, filter once
SELECT *
FROM (
    SELECT 
        category,
        COUNT(*) as cnt,
        AVG(price) as avg_price,
        STD(price) as std_price,
        STD(price) / AVG(price) as coeff_variation
    FROM products
    GROUP BY category
) as stats
WHERE coeff_variation > 0.5
    AND cnt > 10;

-- ========================================
-- Strategy 5: Partition elimination
-- ========================================

-- If table is partitioned by date
ALTER TABLE orders
PARTITION BY RANGE (YEAR(order_date)) (
    PARTITION p2023 VALUES LESS THAN (2024),
    PARTITION p2024 VALUES LESS THAN (2025),
    PARTITION p2025 VALUES LESS THAN (2026)
);

-- This query only scans relevant partitions
SELECT 
    customer_id,
    COUNT(*) as orders_2024
FROM orders
WHERE order_date >= '2024-01-01' 
    AND order_date < '2025-01-01'
GROUP BY customer_id
HAVING COUNT(*) > 50;

Common HAVING Pitfalls and Solutions

Avoiding HAVING Mistakes

-- ========================================
-- PITFALL 1: Column Aliases in HAVING
-- ========================================

-- MySQL allows this (non-standard):
SELECT 
    customer_id,
    COUNT(*) as order_count
FROM orders
GROUP BY customer_id
HAVING order_count > 10;  -- Works in MySQL

-- Standard SQL requires:
SELECT 
    customer_id,
    COUNT(*) as order_count
FROM orders
GROUP BY customer_id
HAVING COUNT(*) > 10;  -- Portable

-- ========================================
-- PITFALL 2: NULL Handling in HAVING
-- ========================================

-- NULLs can cause unexpected results
SELECT 
    category,
    AVG(discount) as avg_discount
FROM products
GROUP BY category
HAVING AVG(discount) > 10;  -- Excludes NULL discount groups

-- Handle NULLs explicitly
SELECT 
    category,
    AVG(COALESCE(discount, 0)) as avg_discount,
    COUNT(discount) as products_with_discount
FROM products
GROUP BY category
HAVING AVG(COALESCE(discount, 0)) > 10
    OR COUNT(discount) = 0;  -- Include all-NULL groups

-- ========================================
-- PITFALL 3: HAVING Without GROUP BY
-- ========================================

-- Valid but treats entire result as one group
SELECT 
    COUNT(*) as total,
    AVG(price) as avg_price
FROM products
HAVING AVG(price) > 100;  -- Filters the single group

-- ========================================
-- PITFALL 4: Mixing WHERE and HAVING Logic
-- ========================================

-- Confusing: Mixes row and group logic
SELECT 
    category,
    COUNT(*) as product_count
FROM products
GROUP BY category
HAVING COUNT(*) > 5 
    AND category != 'Discontinued';  -- Should be WHERE

-- Clear: Separate row and group filters
SELECT 
    category,
    COUNT(*) as product_count
FROM products
WHERE category != 'Discontinued'  -- Row filter
GROUP BY category
HAVING COUNT(*) > 5;  -- Group filter

-- ========================================
-- PITFALL 5: Incorrect Aggregate Scope
-- ========================================

-- Wrong: Trying to filter individual rows in HAVING
SELECT 
    customer_id,
    order_id,  -- Not in GROUP BY!
    SUM(amount)
FROM orders
GROUP BY customer_id
HAVING amount > 100;  -- Error: amount not aggregated

-- Correct: Aggregate or include in GROUP BY
SELECT 
    customer_id,
    COUNT(*) as order_count,
    SUM(amount) as total
FROM orders
WHERE amount > 100  -- Filter individual orders
GROUP BY customer_id
HAVING SUM(amount) > 1000;  -- Filter grouped totals

Practice Exercise

💻
Advanced HAVING Challenges
Solve these complex HAVING scenarios:
  1. Find customer segments where average order value exceeds median
  2. Identify products with abnormal return rates (statistical outliers)
  3. Find months where sales exceeded both previous month and year-ago month
  4. Detect categories with high price variance but stable sales
  5. Find suppliers whose product ratings improved consistently
  6. Identify customers with irregular purchase patterns
  7. Find product combinations frequently bought together (>50% of orders)
  8. Detect seasonal categories using coefficient of variation
💡 Click for Solutions
-- 1. Customer segments where average exceeds median
WITH segment_stats AS (
    SELECT 
        customer_segment,
        AVG(order_value) as avg_value,
        COUNT(*) as order_count
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY customer_segment
),
median_calc AS (
    SELECT 
        customer_segment,
        order_value,
        ROW_NUMBER() OVER (PARTITION BY customer_segment ORDER BY order_value) as rn,
        COUNT(*) OVER (PARTITION BY customer_segment) as cnt
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
)
SELECT 
    s.customer_segment,
    s.avg_value,
    m.median_value,
    s.order_count
FROM segment_stats s
JOIN (
    SELECT 
        customer_segment,
        AVG(order_value) as median_value
    FROM median_calc
    WHERE rn IN (FLOOR((cnt + 1) / 2), CEIL((cnt + 1) / 2))
    GROUP BY customer_segment
) m ON s.customer_segment = m.customer_segment
WHERE s.avg_value > m.median_value * 1.2;

-- 2. Products with abnormal return rates
SELECT 
    product_id,
    product_name,
    COUNT(*) as total_sales,
    SUM(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as returns,
    AVG(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as return_rate,
    STD(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as return_std
FROM order_items oi
JOIN products p ON oi.product_id = p.product_id
GROUP BY product_id
HAVING 
    COUNT(*) >= 20  -- Minimum sample size
    AND AVG(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) > (
        SELECT AVG(return_rate) + 2 * STD(return_rate)
        FROM (
            SELECT AVG(CASE WHEN returned = TRUE THEN 1 ELSE 0 END) as return_rate
            FROM order_items
            GROUP BY product_id
        ) as rates
    );

-- 3. Months exceeding both previous and year-ago
WITH monthly_sales AS (
    SELECT 
        DATE_FORMAT(order_date, '%Y-%m') as month,
        SUM(amount) as revenue
    FROM orders
    WHERE status = 'completed'
    GROUP BY DATE_FORMAT(order_date, '%Y-%m')
)
SELECT 
    month,
    revenue,
    LAG(revenue, 1) OVER (ORDER BY month) as prev_month,
    LAG(revenue, 12) OVER (ORDER BY month) as year_ago
FROM monthly_sales
HAVING 
    revenue > LAG(revenue, 1) OVER (ORDER BY month) * 1.05
    AND revenue > LAG(revenue, 12) OVER (ORDER BY month) * 1.10;

-- 4. High price variance but stable sales
SELECT 
    category,
    COUNT(DISTINCT product_id) as products,
    STD(price) as price_std,
    AVG(price) as price_avg,
    STD(monthly_sales) as sales_std,
    AVG(monthly_sales) as sales_avg,
    STD(price) / AVG(price) as price_cv,
    STD(monthly_sales) / AVG(monthly_sales) as sales_cv
FROM (
    SELECT 
        p.category,
        p.product_id,
        p.price,
        DATE_FORMAT(o.order_date, '%Y-%m') as month,
        SUM(oi.quantity) as monthly_sales
    FROM products p
    JOIN order_items oi ON p.product_id = oi.product_id
    JOIN orders o ON oi.order_id = o.order_id
    GROUP BY p.category, p.product_id, month
) as product_sales
GROUP BY category
HAVING 
    STD(price) / AVG(price) > 0.5  -- High price variance
    AND STD(monthly_sales) / AVG(monthly_sales) < 0.2;  -- Low sales variance

-- 5. Suppliers with consistent rating improvement
WITH supplier_ratings AS (
    SELECT 
        s.supplier_id,
        s.supplier_name,
        QUARTER(r.review_date) as quarter,
        YEAR(r.review_date) as year,
        AVG(r.rating) as avg_rating,
        ROW_NUMBER() OVER (PARTITION BY s.supplier_id ORDER BY YEAR(r.review_date), QUARTER(r.review_date)) as quarter_num
    FROM suppliers s
    JOIN products p ON s.supplier_id = p.supplier_id
    JOIN reviews r ON p.product_id = r.product_id
    GROUP BY s.supplier_id, year, quarter
)
SELECT 
    supplier_id,
    supplier_name,
    COUNT(*) as quarters_tracked,
    MIN(avg_rating) as min_rating,
    MAX(avg_rating) as max_rating
FROM supplier_ratings
GROUP BY supplier_id
HAVING 
    COUNT(*) >= 4  -- At least 4 quarters
    AND MIN(CASE WHEN quarter_num > 1 THEN avg_rating END) > 
        MIN(CASE WHEN quarter_num = 1 THEN avg_rating END);

-- 6. Customers with irregular patterns
SELECT 
    customer_id,
    COUNT(*) as order_count,
    AVG(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) as avg_days_between,
    STD(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) as std_days_between,
    MIN(amount) as min_order,
    MAX(amount) as max_order,
    STD(amount) as std_amount
FROM orders
GROUP BY customer_id
HAVING 
    COUNT(*) >= 10
    AND STD(amount) > AVG(amount) * 0.8  -- High amount variance
    AND STD(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) > 
        AVG(DATEDIFF(order_date, LAG(order_date) OVER (PARTITION BY customer_id ORDER BY order_date))) * 0.5;

-- 7. Product combinations (association rules)
SELECT 
    oi1.product_id as product_a,
    oi2.product_id as product_b,
    COUNT(DISTINCT oi1.order_id) as orders_together,
    COUNT(DISTINCT oi1.order_id) * 100.0 / (
        SELECT COUNT(DISTINCT order_id)
        FROM order_items
        WHERE product_id = oi1.product_id
    ) as support_percentage
FROM order_items oi1
JOIN order_items oi2 ON oi1.order_id = oi2.order_id
    AND oi1.product_id < oi2.product_id
GROUP BY oi1.product_id, oi2.product_id
HAVING 
    COUNT(DISTINCT oi1.order_id) >= 10
    AND COUNT(DISTINCT oi1.order_id) * 100.0 / (
        SELECT COUNT(DISTINCT order_id)
        FROM order_items
        WHERE product_id = oi1.product_id
    ) > 50;

-- 8. Seasonal categories using CV
SELECT 
    category,
    COUNT(DISTINCT MONTH(order_date)) as months_active,
    AVG(monthly_revenue) as avg_monthly_revenue,
    STD(monthly_revenue) as std_monthly_revenue,
    STD(monthly_revenue) / AVG(monthly_revenue) as coefficient_of_variation,
    CASE 
        WHEN STD(monthly_revenue) / AVG(monthly_revenue) > 0.5 THEN 'Highly Seasonal'
        WHEN STD(monthly_revenue) / AVG(monthly_revenue) > 0.3 THEN 'Moderately Seasonal'
        ELSE 'Stable'
    END as seasonality
FROM (
    SELECT 
        p.category,
        MONTH(o.order_date) as month,
        SUM(oi.quantity * oi.unit_price) as monthly_revenue
    FROM products p
    JOIN order_items oi ON p.product_id = oi.product_id
    JOIN orders o ON oi.order_id = o.order_id
    WHERE o.status = 'completed'
    GROUP BY p.category, MONTH(o.order_date)
) as monthly_data
GROUP BY category
HAVING STD(monthly_revenue) / AVG(monthly_revenue) > 0.3;

Additional Resources