Skip to content

SQL 与数据库分析资源专题

SQL & Database Analytics Resources


📚 核心理论

经典教材

书名 作者 难度 推荐理由
《SQL 必知必会》 Ben Forta ⭐⭐ 入门首选,简洁实用
《Learning SQL》 Alan Beaulieu ⭐⭐ O'Reilly 经典
《SQL Cookbook》 Anthony Molinaro ⭐⭐⭐ 实用技巧大全
《高性能 MySQL》 Baron Schwartz ⭐⭐⭐⭐ 性能优化圣经
《PostgreSQL 实战》 谭峰 ⭐⭐⭐ 中文实战指南

🎓 在线课程

系统性课程

课程名称 平台 难度 链接
SQL for Data Science Coursera ⭐⭐ 链接
Learn SQL Codecademy ⭐⭐ 链接
The Complete SQL Bootcamp Udemy ⭐⭐ 链接
SQLZOO SQLZOO ⭐⭐ 链接

专项练习

平台 题型 链接
LeetCode Database 算法题 链接
HackerRank SQL 编程挑战 链接
Mode Analytics SQL Tutorial 实战教程 链接
SQLBolt 互动学习 链接

🛠️ 工具库

Python SQL 库

库名 用途 文档链接
SQLAlchemy ORM 框架 链接
psycopg2 PostgreSQL 驱动 链接
pymysql MySQL 驱动 链接
sqlite3 SQLite 内置库 文档
pyodbc ODBC 连接 链接

数据库管理工具

工具 类型 特点 链接
DBeaver 桌面应用 支持多种数据库,免费 链接
pgAdmin Web 应用 PostgreSQL 官方工具 链接
MySQL Workbench 桌面应用 MySQL 官方工具 链接
DataGrip IDE JetBrains 出品,付费 链接

📊 核心技能

窗口函数

-- 1. 排名函数
SELECT 
    name,
    score,
    RANK() OVER (ORDER BY score DESC) as rank,
    DENSE_RANK() OVER (ORDER BY score DESC) as dense_rank,
    ROW_NUMBER() OVER (ORDER BY score DESC) as row_num
FROM students;

-- 2. 聚合窗口函数
SELECT 
    date,
    sales,
    SUM(sales) OVER (ORDER BY date) as cumulative_sales,
    AVG(sales) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as moving_avg_7d,
    SUM(sales) OVER (PARTITION BY region) as region_total
FROM daily_sales;

-- 3. 分析函数
SELECT 
    product_id,
    sale_date,
    price,
    LAG(price, 1) OVER (PARTITION BY product_id ORDER BY sale_date) as prev_price,
    LEAD(price, 1) OVER (PARTITION BY product_id ORDER BY sale_date) as next_price,
    FIRST_VALUE(price) OVER (PARTITION BY product_id ORDER BY sale_date) as first_price,
    LAST_VALUE(price) OVER (PARTITION BY product_id ORDER BY sale_date RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_price
FROM product_prices;

CTE(公用表表达式)

-- 1. 基础 CTE
WITH high_value_customers AS (
    SELECT customer_id, SUM(order_amount) as total_spent
    FROM orders
    GROUP BY customer_id
    HAVING SUM(order_amount) > 10000
)
SELECT c.name, h.total_spent
FROM customers c
JOIN high_value_customers h ON c.customer_id = h.customer_id;

-- 2. 递归 CTE(层级查询)
WITH RECURSIVE org_hierarchy AS (
    -- 基础查询:获取顶层节点
    SELECT employee_id, name, manager_id, 1 as level
    FROM employees
    WHERE manager_id IS NULL

    UNION ALL

    -- 递归查询:获取下级员工
    SELECT e.employee_id, e.name, e.manager_id, oh.level + 1
    FROM employees e
    INNER JOIN org_hierarchy oh ON e.manager_id = oh.employee_id
)
SELECT * FROM org_hierarchy
ORDER BY level, employee_id;

性能优化

-- 1. 使用 EXPLAIN 分析查询计划
EXPLAIN ANALYZE
SELECT * FROM orders
WHERE customer_id = 100
AND order_date >= '2024-01-01';

-- 2. 创建合适的索引
CREATE INDEX idx_customer_order ON orders(customer_id, order_date);

-- 3. 避免 SELECT *
SELECT order_id, customer_id, order_date, amount
FROM orders;

-- 4. 使用 EXISTS 代替 IN(大数据量时)
SELECT * FROM customers c
WHERE EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.customer_id = c.customer_id
    AND o.amount > 1000
);

-- 5. 优化 JOIN
SELECT o.order_id, c.name, p.product_name
FROM orders o
INNER JOIN customers c ON o.customer_id = c.customer_id
INNER JOIN order_items oi ON o.order_id = oi.order_id
INNER JOIN products p ON oi.product_id = p.product_id
WHERE o.order_date BETWEEN '2024-01-01' AND '2024-12-31';

📈 数据分析实战

用户行为分析

-- 1. 用户留存分析
WITH user_first_order AS (
    SELECT 
        customer_id,
        DATE(MIN(order_date)) as first_order_date
    FROM orders
    GROUP BY customer_id
),
retention AS (
    SELECT 
        ufo.first_order_date,
        COUNT(DISTINCT ufo.customer_id) as total_users,
        COUNT(DISTINCT CASE 
            WHEN DATEDIFF(o.order_date, ufo.first_order_date) = 7 
            THEN ufo.customer_id 
        END) as day_7_retention,
        COUNT(DISTINCT CASE 
            WHEN DATEDIFF(o.order_date, ufo.first_order_date) = 30 
            THEN ufo.customer_id 
        END) as day_30_retention
    FROM user_first_order ufo
    LEFT JOIN orders o ON ufo.customer_id = o.customer_id
    GROUP BY ufo.first_order_date
)
SELECT 
    first_order_date,
    total_users,
    ROUND(day_7_retention * 100.0 / total_users, 2) as day_7_retention_rate,
    ROUND(day_30_retention * 100.0 / total_users, 2) as day_30_retention_rate
FROM retention
ORDER BY first_order_date;

-- 2. RFM 用户分群
WITH rfm AS (
    SELECT 
        customer_id,
        DATEDIFF(CURRENT_DATE, MAX(order_date)) as recency,
        COUNT(order_id) as frequency,
        SUM(amount) as monetary
    FROM orders
    GROUP BY customer_id
),
rfm_scores AS (
    SELECT 
        customer_id,
        NTILE(4) OVER (ORDER BY recency DESC) as r_score,
        NTILE(4) OVER (ORDER BY frequency) as f_score,
        NTILE(4) OVER (ORDER BY monetary) as m_score
    FROM rfm
)
SELECT 
    customer_id,
    r_score,
    f_score,
    m_score,
    r_score + f_score + m_score as rfm_total,
    CASE 
        WHEN r_score >= 3 AND f_score >= 3 AND m_score >= 3 THEN '重要价值用户'
        WHEN r_score >= 3 AND f_score >= 2 THEN '重要发展用户'
        WHEN r_score >= 2 AND f_score >= 2 THEN '一般用户'
        ELSE '低价值用户'
    END as segment
FROM rfm_scores
ORDER BY rfm_total DESC;

销售分析

-- 1. 同期群分析(Cohort Analysis)
WITH cohort_data AS (
    SELECT 
        c.customer_id,
        DATE_TRUNC('month', c.registration_date)::date as cohort_date,
        DATE_TRUNC('month', o.order_date)::date as activity_date,
        EXTRACT(YEAR FROM AGE(o.order_date, c.registration_date)) * 12 +
        EXTRACT(MONTH FROM AGE(o.order_date, c.registration_date)) as months_active
    FROM customers c
    LEFT JOIN orders o ON c.customer_id = o.customer_id
),
cohort_size AS (
    SELECT 
        cohort_date,
        COUNT(DISTINCT customer_id) as cohort_size
    FROM cohort_data
    GROUP BY cohort_date
),
retention_data AS (
    SELECT 
        cd.cohort_date,
        cd.months_active,
        COUNT(DISTINCT cd.customer_id) as active_users
    FROM cohort_data cd
    GROUP BY cd.cohort_date, cd.months_active
)
SELECT 
    rd.cohort_date,
    cs.cohort_size,
    rd.months_active,
    rd.active_users,
    ROUND(rd.active_users * 100.0 / cs.cohort_size, 2) as retention_rate
FROM retention_data rd
JOIN cohort_size cs ON rd.cohort_date = cs.cohort_date
ORDER BY rd.cohort_date, rd.months_active;

-- 2. 销售漏斗分析
WITH funnel AS (
    SELECT 
        'Page View' as stage,
        COUNT(DISTINCT session_id) as users,
        100.0 as percentage
    FROM page_views

    UNION ALL

    SELECT 
        'Add to Cart' as stage,
        COUNT(DISTINCT session_id) as users,
        COUNT(DISTINCT session_id) * 100.0 / (SELECT COUNT(DISTINCT session_id) FROM page_views) as percentage
    FROM cart_additions

    UNION ALL

    SELECT 
        'Checkout' as stage,
        COUNT(DISTINCT session_id) as users,
        COUNT(DISTINCT session_id) * 100.0 / (SELECT COUNT(DISTINCT session_id) FROM page_views) as percentage
    FROM checkouts

    UNION ALL

    SELECT 
        'Purchase' as stage,
        COUNT(DISTINCT session_id) as users,
        COUNT(DISTINCT session_id) * 100.0 / (SELECT COUNT(DISTINCT session_id) FROM page_views) as percentage
    FROM purchases
)
SELECT * FROM funnel
ORDER BY 
    CASE stage
        WHEN 'Page View' THEN 1
        WHEN 'Add to Cart' THEN 2
        WHEN 'Checkout' THEN 3
        WHEN 'Purchase' THEN 4
    END;

🔍 常见问题

Q1: 如何处理 NULL 值?

-- 使用 COALESCE 返回第一个非 NULL 值
SELECT 
    customer_id,
    COALESCE(phone, email, 'No Contact') as contact_info
FROM customers;

-- 使用 NULLIF 避免除零错误
SELECT 
    revenue,
    cost,
    NULLIF(revenue - cost, 0) as profit
FROM sales;

Q2: 如何优化大表查询?

A: 1. 添加合适的索引 2. 使用分区表 3. 只查询需要的列 4. 使用 LIMIT 限制结果数 5. 避免在 WHERE 子句中对字段使用函数

-- 坏例子
SELECT * FROM orders WHERE YEAR(order_date) = 2024;

-- 好例子
SELECT order_id, customer_id, amount
FROM orders 
WHERE order_date >= '2024-01-01' 
AND order_date < '2025-01-01';

🔗 更多资源

官方文档

练习平台

性能工具


最后更新: 2026-06-01