Mastering SQL Subqueries
Learning Objectives
- Understand subquery fundamentals and types
- Master scalar, row, and table subqueries
- Use subqueries in SELECT, WHERE, FROM, and HAVING
- Implement correlated vs non-correlated subqueries
- Apply EXISTS, IN, ANY, ALL operators
- Convert between subqueries and JOINs
- Optimize subquery performance
- Handle complex nested query scenarios
Understanding Subqueries
Subqueries are queries within queries - powerful tools for complex data retrieval and filtering! 🎯
Subquery Key Concepts
Scalar Subqueries
graph TB
SCALAR[Scalar Subquery] --> SINGLE[Single Value]
SINGLE --> SELECT[In SELECT]
SINGLE --> WHERE[In WHERE]
SINGLE --> HAVING[In HAVING]
SELECT --> S1[Column calculation]
WHERE --> W1[Comparison operator]
HAVING --> H1[Aggregate comparison]
style SCALAR fill:#dcfce7
style SINGLE fill:#dbeafe
style SELECT fill:#fef3c7
Scalar Subquery Examples
-- ========================================
-- SCALAR SUBQUERIES IN SELECT
-- ========================================
-- Add overall average to each row
SELECT
product_name,
price,
(SELECT AVG(price) FROM products) AS avg_price,
price - (SELECT AVG(price) FROM products) AS diff_from_avg
FROM products;
-- Get latest order for each customer
SELECT
c.customer_id,
c.customer_name,
(
SELECT MAX(order_date)
FROM orders o
WHERE o.customer_id = c.customer_id
) AS last_order_date,
(
SELECT COUNT(*)
FROM orders o
WHERE o.customer_id = c.customer_id
) AS total_orders
FROM customers c;
-- Calculate percentage of total
SELECT
category,
SUM(price) AS category_total,
ROUND(
SUM(price) * 100.0 / (SELECT SUM(price) FROM products),
2
) AS percent_of_total
FROM products
GROUP BY category;
-- ========================================
-- SCALAR SUBQUERIES IN WHERE
-- ========================================
-- Find products above average price
SELECT *
FROM products
WHERE price > (SELECT AVG(price) FROM products);
-- Find most recent order
SELECT *
FROM orders
WHERE order_date = (SELECT MAX(order_date) FROM orders);
-- Find customers with above-average orders
SELECT *
FROM customers
WHERE (
SELECT AVG(amount)
FROM orders
WHERE customer_id = customers.customer_id
) > (
SELECT AVG(amount) FROM orders
);
-- ========================================
-- SCALAR SUBQUERIES IN HAVING
-- ========================================
-- Groups with above-average aggregates
SELECT
category,
AVG(price) AS avg_price
FROM products
GROUP BY category
HAVING AVG(price) > (
SELECT AVG(price) FROM products
);
-- ========================================
-- SCALAR SUBQUERIES WITH CASE
-- ========================================
SELECT
product_name,
price,
CASE
WHEN price > (SELECT AVG(price) * 1.5 FROM products) THEN 'Premium'
WHEN price > (SELECT AVG(price) FROM products) THEN 'Above Average'
WHEN price > (SELECT AVG(price) * 0.5 FROM products) THEN 'Below Average'
ELSE 'Budget'
END AS price_tier
FROM products;
-- ========================================
-- ERROR HANDLING IN SCALAR SUBQUERIES
-- ========================================
-- This will error if subquery returns multiple rows
-- SELECT * FROM products
-- WHERE price = (SELECT price FROM products); -- Error!
-- Fix with aggregate
SELECT * FROM products
WHERE price = (SELECT MAX(price) FROM products);
-- Or use IN for multiple values
SELECT * FROM products
WHERE price IN (SELECT price FROM products WHERE category = 'Electronics');
Row and Table Subqueries
Row and Table Subquery Examples
-- ========================================
-- ROW SUBQUERIES
-- ========================================
-- Multi-column comparison
SELECT *
FROM orders
WHERE (customer_id, order_date) = (
SELECT customer_id, MAX(order_date)
FROM orders
GROUP BY customer_id
LIMIT 1
);
-- Find matching row combinations
SELECT *
FROM products p1
WHERE (category, price) IN (
SELECT category, MAX(price)
FROM products
GROUP BY category
);
-- Row constructor comparison
SELECT *
FROM employees
WHERE (department, salary) = (
SELECT department, MAX(salary)
FROM employees
WHERE department = 'Sales'
);
-- ========================================
-- TABLE SUBQUERIES (Derived Tables)
-- ========================================
-- Subquery in FROM clause
SELECT
category,
avg_price,
product_count
FROM (
SELECT
category,
AVG(price) AS avg_price,
COUNT(*) AS product_count
FROM products
GROUP BY category
) AS category_stats
WHERE avg_price > 100;
-- Multiple derived tables
SELECT
c.customer_name,
os.total_orders,
os.total_spent,
rs.review_count
FROM customers c
JOIN (
SELECT
customer_id,
COUNT(*) AS total_orders,
SUM(amount) AS total_spent
FROM orders
WHERE status = 'completed'
GROUP BY customer_id
) AS os ON c.customer_id = os.customer_id
LEFT JOIN (
SELECT
user_id AS customer_id,
COUNT(*) AS review_count
FROM reviews
GROUP BY user_id
) AS rs ON c.customer_id = rs.customer_id;
-- ========================================
-- LATERAL DERIVED TABLES (MySQL 8.0.14+)
-- ========================================
-- LATERAL allows derived table to reference preceding tables
SELECT
c.customer_name,
recent_orders.*
FROM customers c
CROSS JOIN LATERAL (
SELECT
COUNT(*) AS order_count,
SUM(amount) AS total_amount,
MAX(order_date) AS last_order
FROM orders o
WHERE o.customer_id = c.customer_id
AND o.order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
) AS recent_orders;
-- ========================================
-- COMPLEX TABLE SUBQUERIES
-- ========================================
-- Ranking within groups using derived table
SELECT *
FROM (
SELECT
product_id,
product_name,
category,
price,
ROW_NUMBER() OVER (PARTITION BY category ORDER BY price DESC) AS price_rank
FROM products
) AS ranked_products
WHERE price_rank <= 3;
-- Pivoting data with derived tables
SELECT
customer_id,
MAX(CASE WHEN month = 1 THEN revenue END) AS jan_revenue,
MAX(CASE WHEN month = 2 THEN revenue END) AS feb_revenue,
MAX(CASE WHEN month = 3 THEN revenue END) AS mar_revenue
FROM (
SELECT
customer_id,
MONTH(order_date) AS month,
SUM(amount) AS revenue
FROM orders
WHERE YEAR(order_date) = 2024
GROUP BY customer_id, MONTH(order_date)
) AS monthly_revenue
GROUP BY customer_id;
Correlated vs Non-Correlated Subqueries
Correlated Subquery Examples
-- ========================================
-- CORRELATED SUBQUERIES
-- ========================================
-- Find products priced above their category average
SELECT
p1.product_name,
p1.category,
p1.price,
(
SELECT AVG(price)
FROM products p2
WHERE p2.category = p1.category
) AS category_avg
FROM products p1
WHERE p1.price > (
SELECT AVG(price)
FROM products p2
WHERE p2.category = p1.category
);
-- Find customers' most expensive order
SELECT
c.customer_name,
(
SELECT MAX(o.amount)
FROM orders o
WHERE o.customer_id = c.customer_id
) AS max_order,
(
SELECT o.order_date
FROM orders o
WHERE o.customer_id = c.customer_id
AND o.amount = (
SELECT MAX(o2.amount)
FROM orders o2
WHERE o2.customer_id = c.customer_id
)
LIMIT 1
) AS max_order_date
FROM customers c;
-- ========================================
-- EXISTS WITH CORRELATED SUBQUERY
-- ========================================
-- Find customers who have ordered in last 30 days
SELECT c.*
FROM customers c
WHERE EXISTS (
SELECT 1
FROM orders o
WHERE o.customer_id = c.customer_id
AND o.order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
);
-- Find products never ordered
SELECT p.*
FROM products p
WHERE NOT EXISTS (
SELECT 1
FROM order_items oi
WHERE oi.product_id = p.product_id
);
-- ========================================
-- IN vs EXISTS Performance
-- ========================================
-- Using IN (non-correlated)
SELECT *
FROM customers
WHERE customer_id IN (
SELECT DISTINCT customer_id
FROM orders
WHERE amount > 1000
);
-- Using EXISTS (correlated) - often faster for large datasets
SELECT *
FROM customers c
WHERE EXISTS (
SELECT 1
FROM orders o
WHERE o.customer_id = c.customer_id
AND o.amount > 1000
);
-- ========================================
-- ANY, ALL, SOME Operators
-- ========================================
-- Products cheaper than all products in 'Electronics'
SELECT *
FROM products
WHERE price < ALL (
SELECT price
FROM products
WHERE category = 'Electronics'
);
-- Products more expensive than any product in 'Books'
SELECT *
FROM products
WHERE price > ANY (
SELECT price
FROM products
WHERE category = 'Books'
);
-- Customers who spent more than any single order average
SELECT
c.customer_id,
c.customer_name,
SUM(o.amount) AS total_spent
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id
HAVING SUM(o.amount) > ANY (
SELECT AVG(amount)
FROM orders
GROUP BY customer_id
);
Subquery vs JOIN Conversion
Performance Consideration
Converting Between Subqueries and JOINs
-- ========================================
-- SUBQUERY TO JOIN CONVERSIONS
-- ========================================
-- Example 1: IN Subquery to JOIN
-- Subquery version
SELECT *
FROM customers
WHERE customer_id IN (
SELECT customer_id
FROM orders
WHERE amount > 1000
);
-- JOIN version (often faster)
SELECT DISTINCT c.*
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
WHERE o.amount > 1000;
-- ========================================
-- Example 2: NOT IN to LEFT JOIN
-- Subquery version
SELECT *
FROM products
WHERE product_id NOT IN (
SELECT product_id
FROM order_items
);
-- JOIN version (handles NULLs better)
SELECT p.*
FROM products p
LEFT JOIN order_items oi ON p.product_id = oi.product_id
WHERE oi.product_id IS NULL;
-- ========================================
-- Example 3: Correlated subquery to JOIN
-- Subquery version
SELECT
c.customer_name,
(SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.customer_id) AS order_count
FROM customers c;
-- JOIN version
SELECT
c.customer_name,
COUNT(o.order_id) AS order_count
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id;
-- ========================================
-- Example 4: EXISTS to JOIN
-- Subquery version
SELECT c.*
FROM customers c
WHERE EXISTS (
SELECT 1
FROM orders o
WHERE o.customer_id = c.customer_id
AND o.status = 'pending'
);
-- JOIN version
SELECT DISTINCT c.*
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
WHERE o.status = 'pending';
-- ========================================
-- WHEN TO USE SUBQUERIES vs JOINS
-- ========================================
-- Use SUBQUERY when:
-- 1. Need a single aggregate value
SELECT
product_name,
price,
price - (SELECT AVG(price) FROM products) AS diff_from_avg
FROM products;
-- 2. Checking existence without needing data
SELECT *
FROM customers c
WHERE NOT EXISTS (
SELECT 1 FROM orders o
WHERE o.customer_id = c.customer_id
);
-- 3. Complex filtering conditions
SELECT *
FROM orders
WHERE amount > (
SELECT AVG(amount) * 1.5
FROM orders
WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
);
-- Use JOIN when:
-- 1. Need multiple columns from related table
SELECT
c.customer_name,
o.order_id,
o.amount
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id;
-- 2. Performance is critical with large datasets
-- 3. Need to aggregate across relationships
Advanced Subquery Patterns
Complex Subquery Patterns
-- ========================================
-- NESTED SUBQUERIES
-- ========================================
-- Multiple levels of nesting
SELECT *
FROM products
WHERE category IN (
SELECT category
FROM categories
WHERE parent_category_id IN (
SELECT category_id
FROM categories
WHERE category_name = 'Electronics'
)
);
-- ========================================
-- SUBQUERIES WITH WINDOW FUNCTIONS
-- ========================================
-- Top N per group using subquery
SELECT *
FROM (
SELECT
*,
RANK() OVER (PARTITION BY category ORDER BY price DESC) AS price_rank
FROM products
) AS ranked
WHERE price_rank <= 3;
-- ========================================
-- RECURSIVE SUBQUERIES (CTEs)
-- ========================================
WITH RECURSIVE category_tree AS (
-- Base case: root categories
SELECT
category_id,
category_name,
parent_category_id,
0 AS level,
category_name AS path
FROM categories
WHERE parent_category_id IS NULL
UNION ALL
-- Recursive case
SELECT
c.category_id,
c.category_name,
c.parent_category_id,
ct.level + 1,
CONCAT(ct.path, ' > ', c.category_name)
FROM categories c
INNER JOIN category_tree ct ON c.parent_category_id = ct.category_id
)
SELECT * FROM category_tree;
-- ========================================
-- SUBQUERY OPTIMIZATION TECHNIQUES
-- ========================================
-- 1. Push conditions into subquery
-- Bad: Filters after subquery
SELECT *
FROM (
SELECT * FROM orders
) AS o
WHERE status = 'completed';
-- Good: Filters in subquery
SELECT *
FROM (
SELECT * FROM orders WHERE status = 'completed'
) AS o;
-- 2. Use EXISTS instead of IN for large lists
-- Less efficient with large results
SELECT *
FROM customers
WHERE customer_id IN (
SELECT customer_id FROM orders WHERE amount > 1000
);
-- More efficient
SELECT *
FROM customers c
WHERE EXISTS (
SELECT 1 FROM orders o
WHERE o.customer_id = c.customer_id AND o.amount > 1000
);
-- 3. Materialize frequently used subqueries
CREATE TEMPORARY TABLE high_value_customers AS
SELECT customer_id
FROM orders
GROUP BY customer_id
HAVING SUM(amount) > 10000;
SELECT c.*
FROM customers c
INNER JOIN high_value_customers hvc ON c.customer_id = hvc.customer_id;
-- ========================================
-- SUBQUERY ERROR HANDLING
-- ========================================
-- Handle potential NULL issues
-- Problem: NULL in NOT IN
SELECT *
FROM products
WHERE category NOT IN (
SELECT category FROM discontinued_categories
-- If any category is NULL, entire query returns no rows!
);
-- Solution: Filter NULLs
SELECT *
FROM products
WHERE category NOT IN (
SELECT category
FROM discontinued_categories
WHERE category IS NOT NULL
);
-- Or use NOT EXISTS (NULL-safe)
SELECT p.*
FROM products p
WHERE NOT EXISTS (
SELECT 1
FROM discontinued_categories dc
WHERE dc.category = p.category
);
Practice Exercise
Subquery Challenges
💡 Click for Solutions
-- 1. Most expensive products in their category
SELECT p1.*
FROM products p1
WHERE p1.price = (
SELECT MAX(p2.price)
FROM products p2
WHERE p2.category = p1.category
);
-- Alternative with row subquery
SELECT *
FROM products
WHERE (category, price) IN (
SELECT category, MAX(price)
FROM products
GROUP BY category
);
-- 2. Customers with 50% above average orders
SELECT
c.*,
(
SELECT AVG(amount)
FROM orders
WHERE customer_id = c.customer_id
) AS customer_avg
FROM customers c
WHERE (
SELECT AVG(amount)
FROM orders
WHERE customer_id = c.customer_id
) > (
SELECT AVG(amount) * 1.5
FROM orders
);
-- 3. Manager-subordinate relationships (recursive)
WITH RECURSIVE org_chart AS (
SELECT
employee_id,
employee_name,
manager_id,
0 AS level,
employee_name AS reporting_chain
FROM employees
WHERE manager_id IS NULL
UNION ALL
SELECT
e.employee_id,
e.employee_name,
e.manager_id,
oc.level + 1,
CONCAT(oc.reporting_chain, ' > ', e.employee_name)
FROM employees e
INNER JOIN org_chart oc ON e.manager_id = oc.employee_id
)
SELECT * FROM org_chart;
-- 4. Customers who ordered every product in a category
SELECT c.*
FROM customers c
WHERE NOT EXISTS (
SELECT p.product_id
FROM products p
WHERE p.category = 'Electronics'
AND NOT EXISTS (
SELECT 1
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
WHERE o.customer_id = c.customer_id
AND oi.product_id = p.product_id
)
);
-- 5. Running totals using subqueries
SELECT
order_date,
amount,
(
SELECT SUM(amount)
FROM orders o2
WHERE o2.order_date <= o1.order_date
AND o2.customer_id = o1.customer_id
) AS running_total
FROM orders o1
WHERE customer_id = 123
ORDER BY order_date;
-- 6. Second highest salary in each department
SELECT
department,
(
SELECT DISTINCT salary
FROM employees e2
WHERE e2.department = e1.department
ORDER BY salary DESC
LIMIT 1 OFFSET 1
) AS second_highest_salary
FROM employees e1
GROUP BY department;
-- Alternative approach
SELECT department, MAX(salary) AS second_highest
FROM employees e1
WHERE salary < (
SELECT MAX(salary)
FROM employees e2
WHERE e2.department = e1.department
)
GROUP BY department;
-- 7. Detect duplicate orders
SELECT o1.*
FROM orders o1
WHERE EXISTS (
SELECT 1
FROM orders o2
WHERE o2.customer_id = o1.customer_id
AND DATE(o2.order_date) = DATE(o1.order_date)
AND o2.amount = o1.amount
AND o2.order_id != o1.order_id
);
-- 8. Percentile ranking using subqueries
SELECT
product_id,
product_name,
price,
(
SELECT COUNT(*)
FROM products p2
WHERE p2.price <= p1.price
) * 100.0 / (SELECT COUNT(*) FROM products) AS percentile_rank
FROM products p1
ORDER BY price;
-- More precise percentile calculation
SELECT
product_name,
price,
ROUND(
(
SELECT COUNT(DISTINCT price)
FROM products p2
WHERE p2.price < p1.price
) * 100.0 / (
SELECT COUNT(DISTINCT price)
FROM products
),
2
) AS percentile
FROM products p1
ORDER BY price;