Skip to main content

Course Progress

Loading...

Mastering SQL Subqueries

Duration: 90 minutes
Module 3: Advanced SQL

Learning Objectives

  • Understand subquery fundamentals and types
  • Master scalar, row, and table subqueries
  • Use subqueries in SELECT, WHERE, FROM, and HAVING
  • Implement correlated vs non-correlated subqueries
  • Apply EXISTS, IN, ANY, ALL operators
  • Convert between subqueries and JOINs
  • Optimize subquery performance
  • Handle complex nested query scenarios

Understanding Subqueries

Subqueries are queries within queries - powerful tools for complex data retrieval and filtering! 🎯

💡
Subquery Key Concepts
  • Scalar Subquery:Returns exactly one value (one row, one column)
  • Row Subquery:Returns one row with multiple columns
  • Table Subquery:Returns multiple rows and columns
  • Correlated:References outer query columns
  • Non-correlated:Independent of outer query
  • Subqueries execute for each row (correlated) or once (non-correlated)

Scalar Subqueries

graph TB SCALAR[Scalar Subquery] --> SINGLE[Single Value] SINGLE --> SELECT[In SELECT] SINGLE --> WHERE[In WHERE] SINGLE --> HAVING[In HAVING] SELECT --> S1[Column calculation] WHERE --> W1[Comparison operator] HAVING --> H1[Aggregate comparison] style SCALAR fill:#dcfce7 style SINGLE fill:#dbeafe style SELECT fill:#fef3c7

Scalar Subquery Examples

-- ========================================
-- SCALAR SUBQUERIES IN SELECT
-- ========================================

-- Add overall average to each row
SELECT 
    product_name,
    price,
    (SELECT AVG(price) FROM products) AS avg_price,
    price - (SELECT AVG(price) FROM products) AS diff_from_avg
FROM products;

-- Get latest order for each customer
SELECT 
    c.customer_id,
    c.customer_name,
    (
        SELECT MAX(order_date)
        FROM orders o
        WHERE o.customer_id = c.customer_id
    ) AS last_order_date,
    (
        SELECT COUNT(*)
        FROM orders o
        WHERE o.customer_id = c.customer_id
    ) AS total_orders
FROM customers c;

-- Calculate percentage of total
SELECT 
    category,
    SUM(price) AS category_total,
    ROUND(
        SUM(price) * 100.0 / (SELECT SUM(price) FROM products),
        2
    ) AS percent_of_total
FROM products
GROUP BY category;

-- ========================================
-- SCALAR SUBQUERIES IN WHERE
-- ========================================

-- Find products above average price
SELECT *
FROM products
WHERE price > (SELECT AVG(price) FROM products);

-- Find most recent order
SELECT *
FROM orders
WHERE order_date = (SELECT MAX(order_date) FROM orders);

-- Find customers with above-average orders
SELECT *
FROM customers
WHERE (
    SELECT AVG(amount)
    FROM orders
    WHERE customer_id = customers.customer_id
) > (
    SELECT AVG(amount) FROM orders
);

-- ========================================
-- SCALAR SUBQUERIES IN HAVING
-- ========================================

-- Groups with above-average aggregates
SELECT 
    category,
    AVG(price) AS avg_price
FROM products
GROUP BY category
HAVING AVG(price) > (
    SELECT AVG(price) FROM products
);

-- ========================================
-- SCALAR SUBQUERIES WITH CASE
-- ========================================

SELECT 
    product_name,
    price,
    CASE 
        WHEN price > (SELECT AVG(price) * 1.5 FROM products) THEN 'Premium'
        WHEN price > (SELECT AVG(price) FROM products) THEN 'Above Average'
        WHEN price > (SELECT AVG(price) * 0.5 FROM products) THEN 'Below Average'
        ELSE 'Budget'
    END AS price_tier
FROM products;

-- ========================================
-- ERROR HANDLING IN SCALAR SUBQUERIES
-- ========================================

-- This will error if subquery returns multiple rows
-- SELECT * FROM products
-- WHERE price = (SELECT price FROM products);  -- Error!

-- Fix with aggregate
SELECT * FROM products
WHERE price = (SELECT MAX(price) FROM products);

-- Or use IN for multiple values
SELECT * FROM products
WHERE price IN (SELECT price FROM products WHERE category = 'Electronics');

Row and Table Subqueries

Row and Table Subquery Examples

-- ========================================
-- ROW SUBQUERIES
-- ========================================

-- Multi-column comparison
SELECT *
FROM orders
WHERE (customer_id, order_date) = (
    SELECT customer_id, MAX(order_date)
    FROM orders
    GROUP BY customer_id
    LIMIT 1
);

-- Find matching row combinations
SELECT *
FROM products p1
WHERE (category, price) IN (
    SELECT category, MAX(price)
    FROM products
    GROUP BY category
);

-- Row constructor comparison
SELECT *
FROM employees
WHERE (department, salary) = (
    SELECT department, MAX(salary)
    FROM employees
    WHERE department = 'Sales'
);

-- ========================================
-- TABLE SUBQUERIES (Derived Tables)
-- ========================================

-- Subquery in FROM clause
SELECT 
    category,
    avg_price,
    product_count
FROM (
    SELECT 
        category,
        AVG(price) AS avg_price,
        COUNT(*) AS product_count
    FROM products
    GROUP BY category
) AS category_stats
WHERE avg_price > 100;

-- Multiple derived tables
SELECT 
    c.customer_name,
    os.total_orders,
    os.total_spent,
    rs.review_count
FROM customers c
JOIN (
    SELECT 
        customer_id,
        COUNT(*) AS total_orders,
        SUM(amount) AS total_spent
    FROM orders
    WHERE status = 'completed'
    GROUP BY customer_id
) AS os ON c.customer_id = os.customer_id
LEFT JOIN (
    SELECT 
        user_id AS customer_id,
        COUNT(*) AS review_count
    FROM reviews
    GROUP BY user_id
) AS rs ON c.customer_id = rs.customer_id;

-- ========================================
-- LATERAL DERIVED TABLES (MySQL 8.0.14+)
-- ========================================

-- LATERAL allows derived table to reference preceding tables
SELECT 
    c.customer_name,
    recent_orders.*
FROM customers c
CROSS JOIN LATERAL (
    SELECT 
        COUNT(*) AS order_count,
        SUM(amount) AS total_amount,
        MAX(order_date) AS last_order
    FROM orders o
    WHERE o.customer_id = c.customer_id
        AND o.order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
) AS recent_orders;

-- ========================================
-- COMPLEX TABLE SUBQUERIES
-- ========================================

-- Ranking within groups using derived table
SELECT *
FROM (
    SELECT 
        product_id,
        product_name,
        category,
        price,
        ROW_NUMBER() OVER (PARTITION BY category ORDER BY price DESC) AS price_rank
    FROM products
) AS ranked_products
WHERE price_rank <= 3;

-- Pivoting data with derived tables
SELECT 
    customer_id,
    MAX(CASE WHEN month = 1 THEN revenue END) AS jan_revenue,
    MAX(CASE WHEN month = 2 THEN revenue END) AS feb_revenue,
    MAX(CASE WHEN month = 3 THEN revenue END) AS mar_revenue
FROM (
    SELECT 
        customer_id,
        MONTH(order_date) AS month,
        SUM(amount) AS revenue
    FROM orders
    WHERE YEAR(order_date) = 2024
    GROUP BY customer_id, MONTH(order_date)
) AS monthly_revenue
GROUP BY customer_id;

Correlated vs Non-Correlated Subqueries

Correlated Subquery Examples

-- ========================================
-- CORRELATED SUBQUERIES
-- ========================================

-- Find products priced above their category average
SELECT 
    p1.product_name,
    p1.category,
    p1.price,
    (
        SELECT AVG(price)
        FROM products p2
        WHERE p2.category = p1.category
    ) AS category_avg
FROM products p1
WHERE p1.price > (
    SELECT AVG(price)
    FROM products p2
    WHERE p2.category = p1.category
);

-- Find customers' most expensive order
SELECT 
    c.customer_name,
    (
        SELECT MAX(o.amount)
        FROM orders o
        WHERE o.customer_id = c.customer_id
    ) AS max_order,
    (
        SELECT o.order_date
        FROM orders o
        WHERE o.customer_id = c.customer_id
            AND o.amount = (
                SELECT MAX(o2.amount)
                FROM orders o2
                WHERE o2.customer_id = c.customer_id
            )
        LIMIT 1
    ) AS max_order_date
FROM customers c;

-- ========================================
-- EXISTS WITH CORRELATED SUBQUERY
-- ========================================

-- Find customers who have ordered in last 30 days
SELECT c.*
FROM customers c
WHERE EXISTS (
    SELECT 1
    FROM orders o
    WHERE o.customer_id = c.customer_id
        AND o.order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
);

-- Find products never ordered
SELECT p.*
FROM products p
WHERE NOT EXISTS (
    SELECT 1
    FROM order_items oi
    WHERE oi.product_id = p.product_id
);

-- ========================================
-- IN vs EXISTS Performance
-- ========================================

-- Using IN (non-correlated)
SELECT *
FROM customers
WHERE customer_id IN (
    SELECT DISTINCT customer_id
    FROM orders
    WHERE amount > 1000
);

-- Using EXISTS (correlated) - often faster for large datasets
SELECT *
FROM customers c
WHERE EXISTS (
    SELECT 1
    FROM orders o
    WHERE o.customer_id = c.customer_id
        AND o.amount > 1000
);

-- ========================================
-- ANY, ALL, SOME Operators
-- ========================================

-- Products cheaper than all products in 'Electronics'
SELECT *
FROM products
WHERE price < ALL (
    SELECT price
    FROM products
    WHERE category = 'Electronics'
);

-- Products more expensive than any product in 'Books'
SELECT *
FROM products
WHERE price > ANY (
    SELECT price
    FROM products
    WHERE category = 'Books'
);

-- Customers who spent more than any single order average
SELECT 
    c.customer_id,
    c.customer_name,
    SUM(o.amount) AS total_spent
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id
HAVING SUM(o.amount) > ANY (
    SELECT AVG(amount)
    FROM orders
    GROUP BY customer_id
);

Subquery vs JOIN Conversion

⚠️
Performance Consideration
Many subqueries can be rewritten as JOINs for better performance. However, some patterns are clearer with subqueries.

Converting Between Subqueries and JOINs

-- ========================================
-- SUBQUERY TO JOIN CONVERSIONS
-- ========================================

-- Example 1: IN Subquery to JOIN
-- Subquery version
SELECT *
FROM customers
WHERE customer_id IN (
    SELECT customer_id
    FROM orders
    WHERE amount > 1000
);

-- JOIN version (often faster)
SELECT DISTINCT c.*
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
WHERE o.amount > 1000;

-- ========================================

-- Example 2: NOT IN to LEFT JOIN
-- Subquery version
SELECT *
FROM products
WHERE product_id NOT IN (
    SELECT product_id
    FROM order_items
);

-- JOIN version (handles NULLs better)
SELECT p.*
FROM products p
LEFT JOIN order_items oi ON p.product_id = oi.product_id
WHERE oi.product_id IS NULL;

-- ========================================

-- Example 3: Correlated subquery to JOIN
-- Subquery version
SELECT 
    c.customer_name,
    (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.customer_id) AS order_count
FROM customers c;

-- JOIN version
SELECT 
    c.customer_name,
    COUNT(o.order_id) AS order_count
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id;

-- ========================================

-- Example 4: EXISTS to JOIN
-- Subquery version
SELECT c.*
FROM customers c
WHERE EXISTS (
    SELECT 1
    FROM orders o
    WHERE o.customer_id = c.customer_id
        AND o.status = 'pending'
);

-- JOIN version
SELECT DISTINCT c.*
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
WHERE o.status = 'pending';

-- ========================================
-- WHEN TO USE SUBQUERIES vs JOINS
-- ========================================

-- Use SUBQUERY when:
-- 1. Need a single aggregate value
SELECT 
    product_name,
    price,
    price - (SELECT AVG(price) FROM products) AS diff_from_avg
FROM products;

-- 2. Checking existence without needing data
SELECT *
FROM customers c
WHERE NOT EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.customer_id = c.customer_id
);

-- 3. Complex filtering conditions
SELECT *
FROM orders
WHERE amount > (
    SELECT AVG(amount) * 1.5
    FROM orders
    WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
);

-- Use JOIN when:
-- 1. Need multiple columns from related table
SELECT 
    c.customer_name,
    o.order_id,
    o.amount
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id;

-- 2. Performance is critical with large datasets
-- 3. Need to aggregate across relationships

Advanced Subquery Patterns

Complex Subquery Patterns

-- ========================================
-- NESTED SUBQUERIES
-- ========================================

-- Multiple levels of nesting
SELECT *
FROM products
WHERE category IN (
    SELECT category
    FROM categories
    WHERE parent_category_id IN (
        SELECT category_id
        FROM categories
        WHERE category_name = 'Electronics'
    )
);

-- ========================================
-- SUBQUERIES WITH WINDOW FUNCTIONS
-- ========================================

-- Top N per group using subquery
SELECT *
FROM (
    SELECT 
        *,
        RANK() OVER (PARTITION BY category ORDER BY price DESC) AS price_rank
    FROM products
) AS ranked
WHERE price_rank <= 3;

-- ========================================
-- RECURSIVE SUBQUERIES (CTEs)
-- ========================================

WITH RECURSIVE category_tree AS (
    -- Base case: root categories
    SELECT 
        category_id,
        category_name,
        parent_category_id,
        0 AS level,
        category_name AS path
    FROM categories
    WHERE parent_category_id IS NULL
    
    UNION ALL
    
    -- Recursive case
    SELECT 
        c.category_id,
        c.category_name,
        c.parent_category_id,
        ct.level + 1,
        CONCAT(ct.path, ' > ', c.category_name)
    FROM categories c
    INNER JOIN category_tree ct ON c.parent_category_id = ct.category_id
)
SELECT * FROM category_tree;

-- ========================================
-- SUBQUERY OPTIMIZATION TECHNIQUES
-- ========================================

-- 1. Push conditions into subquery
-- Bad: Filters after subquery
SELECT *
FROM (
    SELECT * FROM orders
) AS o
WHERE status = 'completed';

-- Good: Filters in subquery
SELECT *
FROM (
    SELECT * FROM orders WHERE status = 'completed'
) AS o;

-- 2. Use EXISTS instead of IN for large lists
-- Less efficient with large results
SELECT *
FROM customers
WHERE customer_id IN (
    SELECT customer_id FROM orders WHERE amount > 1000
);

-- More efficient
SELECT *
FROM customers c
WHERE EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.customer_id = c.customer_id AND o.amount > 1000
);

-- 3. Materialize frequently used subqueries
CREATE TEMPORARY TABLE high_value_customers AS
SELECT customer_id
FROM orders
GROUP BY customer_id
HAVING SUM(amount) > 10000;

SELECT c.*
FROM customers c
INNER JOIN high_value_customers hvc ON c.customer_id = hvc.customer_id;

-- ========================================
-- SUBQUERY ERROR HANDLING
-- ========================================

-- Handle potential NULL issues
-- Problem: NULL in NOT IN
SELECT *
FROM products
WHERE category NOT IN (
    SELECT category FROM discontinued_categories
    -- If any category is NULL, entire query returns no rows!
);

-- Solution: Filter NULLs
SELECT *
FROM products
WHERE category NOT IN (
    SELECT category 
    FROM discontinued_categories 
    WHERE category IS NOT NULL
);

-- Or use NOT EXISTS (NULL-safe)
SELECT p.*
FROM products p
WHERE NOT EXISTS (
    SELECT 1 
    FROM discontinued_categories dc 
    WHERE dc.category = p.category
);

Practice Exercise

💻
Subquery Challenges
Solve these complex subquery scenarios:
  1. Find products that are the most expensive in their category
  2. List customers whose average order exceeds the overall average by 50%
  3. Find all manager-subordinate relationships (recursive)
  4. Identify customers who ordered every product in a category
  5. Calculate running totals using subqueries (not window functions)
  6. Find the second highest salary in each department
  7. Detect duplicate orders (same customer, date, amount)
  8. Build a percentile ranking system using subqueries
💡 Click for Solutions
-- 1. Most expensive products in their category
SELECT p1.*
FROM products p1
WHERE p1.price = (
    SELECT MAX(p2.price)
    FROM products p2
    WHERE p2.category = p1.category
);

-- Alternative with row subquery
SELECT *
FROM products
WHERE (category, price) IN (
    SELECT category, MAX(price)
    FROM products
    GROUP BY category
);

-- 2. Customers with 50% above average orders
SELECT 
    c.*,
    (
        SELECT AVG(amount)
        FROM orders
        WHERE customer_id = c.customer_id
    ) AS customer_avg
FROM customers c
WHERE (
    SELECT AVG(amount)
    FROM orders
    WHERE customer_id = c.customer_id
) > (
    SELECT AVG(amount) * 1.5
    FROM orders
);

-- 3. Manager-subordinate relationships (recursive)
WITH RECURSIVE org_chart AS (
    SELECT 
        employee_id,
        employee_name,
        manager_id,
        0 AS level,
        employee_name AS reporting_chain
    FROM employees
    WHERE manager_id IS NULL
    
    UNION ALL
    
    SELECT 
        e.employee_id,
        e.employee_name,
        e.manager_id,
        oc.level + 1,
        CONCAT(oc.reporting_chain, ' > ', e.employee_name)
    FROM employees e
    INNER JOIN org_chart oc ON e.manager_id = oc.employee_id
)
SELECT * FROM org_chart;

-- 4. Customers who ordered every product in a category
SELECT c.*
FROM customers c
WHERE NOT EXISTS (
    SELECT p.product_id
    FROM products p
    WHERE p.category = 'Electronics'
    AND NOT EXISTS (
        SELECT 1
        FROM orders o
        JOIN order_items oi ON o.order_id = oi.order_id
        WHERE o.customer_id = c.customer_id
        AND oi.product_id = p.product_id
    )
);

-- 5. Running totals using subqueries
SELECT 
    order_date,
    amount,
    (
        SELECT SUM(amount)
        FROM orders o2
        WHERE o2.order_date <= o1.order_date
            AND o2.customer_id = o1.customer_id
    ) AS running_total
FROM orders o1
WHERE customer_id = 123
ORDER BY order_date;

-- 6. Second highest salary in each department
SELECT 
    department,
    (
        SELECT DISTINCT salary
        FROM employees e2
        WHERE e2.department = e1.department
        ORDER BY salary DESC
        LIMIT 1 OFFSET 1
    ) AS second_highest_salary
FROM employees e1
GROUP BY department;

-- Alternative approach
SELECT department, MAX(salary) AS second_highest
FROM employees e1
WHERE salary < (
    SELECT MAX(salary)
    FROM employees e2
    WHERE e2.department = e1.department
)
GROUP BY department;

-- 7. Detect duplicate orders
SELECT o1.*
FROM orders o1
WHERE EXISTS (
    SELECT 1
    FROM orders o2
    WHERE o2.customer_id = o1.customer_id
        AND DATE(o2.order_date) = DATE(o1.order_date)
        AND o2.amount = o1.amount
        AND o2.order_id != o1.order_id
);

-- 8. Percentile ranking using subqueries
SELECT 
    product_id,
    product_name,
    price,
    (
        SELECT COUNT(*)
        FROM products p2
        WHERE p2.price <= p1.price
    ) * 100.0 / (SELECT COUNT(*) FROM products) AS percentile_rank
FROM products p1
ORDER BY price;

-- More precise percentile calculation
SELECT 
    product_name,
    price,
    ROUND(
        (
            SELECT COUNT(DISTINCT price)
            FROM products p2
            WHERE p2.price < p1.price
        ) * 100.0 / (
            SELECT COUNT(DISTINCT price)
            FROM products
        ),
        2
    ) AS percentile
FROM products p1
ORDER BY price;

Additional Resources