SQL 与数据库分析资源专题
SQL & Database Analytics Resources
📚 核心理论
经典教材
| 书名 |
作者 |
难度 |
推荐理由 |
| 《SQL 必知必会》 |
Ben Forta |
⭐⭐ |
入门首选,简洁实用 |
| 《Learning SQL》 |
Alan Beaulieu |
⭐⭐ |
O'Reilly 经典 |
| 《SQL Cookbook》 |
Anthony Molinaro |
⭐⭐⭐ |
实用技巧大全 |
| 《高性能 MySQL》 |
Baron Schwartz |
⭐⭐⭐⭐ |
性能优化圣经 |
| 《PostgreSQL 实战》 |
谭峰 |
⭐⭐⭐ |
中文实战指南 |
🎓 在线课程
系统性课程
| 课程名称 |
平台 |
难度 |
链接 |
| SQL for Data Science |
Coursera |
⭐⭐ |
链接 |
| Learn SQL |
Codecademy |
⭐⭐ |
链接 |
| The Complete SQL Bootcamp |
Udemy |
⭐⭐ |
链接 |
| SQLZOO |
SQLZOO |
⭐⭐ |
链接 |
专项练习
| 平台 |
题型 |
链接 |
| LeetCode Database |
算法题 |
链接 |
| HackerRank SQL |
编程挑战 |
链接 |
| Mode Analytics SQL Tutorial |
实战教程 |
链接 |
| SQLBolt |
互动学习 |
链接 |
🛠️ 工具库
Python SQL 库
| 库名 |
用途 |
文档链接 |
| SQLAlchemy |
ORM 框架 |
链接 |
| psycopg2 |
PostgreSQL 驱动 |
链接 |
| pymysql |
MySQL 驱动 |
链接 |
| sqlite3 |
SQLite 内置库 |
文档 |
| pyodbc |
ODBC 连接 |
链接 |
数据库管理工具
| 工具 |
类型 |
特点 |
链接 |
| DBeaver |
桌面应用 |
支持多种数据库,免费 |
链接 |
| pgAdmin |
Web 应用 |
PostgreSQL 官方工具 |
链接 |
| MySQL Workbench |
桌面应用 |
MySQL 官方工具 |
链接 |
| DataGrip |
IDE |
JetBrains 出品,付费 |
链接 |
📊 核心技能
窗口函数
-- 1. 排名函数
SELECT
name,
score,
RANK() OVER (ORDER BY score DESC) as rank,
DENSE_RANK() OVER (ORDER BY score DESC) as dense_rank,
ROW_NUMBER() OVER (ORDER BY score DESC) as row_num
FROM students;
-- 2. 聚合窗口函数
SELECT
date,
sales,
SUM(sales) OVER (ORDER BY date) as cumulative_sales,
AVG(sales) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as moving_avg_7d,
SUM(sales) OVER (PARTITION BY region) as region_total
FROM daily_sales;
-- 3. 分析函数
SELECT
product_id,
sale_date,
price,
LAG(price, 1) OVER (PARTITION BY product_id ORDER BY sale_date) as prev_price,
LEAD(price, 1) OVER (PARTITION BY product_id ORDER BY sale_date) as next_price,
FIRST_VALUE(price) OVER (PARTITION BY product_id ORDER BY sale_date) as first_price,
LAST_VALUE(price) OVER (PARTITION BY product_id ORDER BY sale_date RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_price
FROM product_prices;
CTE(公用表表达式)
-- 1. 基础 CTE
WITH high_value_customers AS (
SELECT customer_id, SUM(order_amount) as total_spent
FROM orders
GROUP BY customer_id
HAVING SUM(order_amount) > 10000
)
SELECT c.name, h.total_spent
FROM customers c
JOIN high_value_customers h ON c.customer_id = h.customer_id;
-- 2. 递归 CTE(层级查询)
WITH RECURSIVE org_hierarchy AS (
-- 基础查询:获取顶层节点
SELECT employee_id, name, manager_id, 1 as level
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- 递归查询:获取下级员工
SELECT e.employee_id, e.name, e.manager_id, oh.level + 1
FROM employees e
INNER JOIN org_hierarchy oh ON e.manager_id = oh.employee_id
)
SELECT * FROM org_hierarchy
ORDER BY level, employee_id;
性能优化
-- 1. 使用 EXPLAIN 分析查询计划
EXPLAIN ANALYZE
SELECT * FROM orders
WHERE customer_id = 100
AND order_date >= '2024-01-01';
-- 2. 创建合适的索引
CREATE INDEX idx_customer_order ON orders(customer_id, order_date);
-- 3. 避免 SELECT *
SELECT order_id, customer_id, order_date, amount
FROM orders;
-- 4. 使用 EXISTS 代替 IN(大数据量时)
SELECT * FROM customers c
WHERE EXISTS (
SELECT 1 FROM orders o
WHERE o.customer_id = c.customer_id
AND o.amount > 1000
);
-- 5. 优化 JOIN
SELECT o.order_id, c.name, p.product_name
FROM orders o
INNER JOIN customers c ON o.customer_id = c.customer_id
INNER JOIN order_items oi ON o.order_id = oi.order_id
INNER JOIN products p ON oi.product_id = p.product_id
WHERE o.order_date BETWEEN '2024-01-01' AND '2024-12-31';
📈 数据分析实战
用户行为分析
-- 1. 用户留存分析
WITH user_first_order AS (
SELECT
customer_id,
DATE(MIN(order_date)) as first_order_date
FROM orders
GROUP BY customer_id
),
retention AS (
SELECT
ufo.first_order_date,
COUNT(DISTINCT ufo.customer_id) as total_users,
COUNT(DISTINCT CASE
WHEN DATEDIFF(o.order_date, ufo.first_order_date) = 7
THEN ufo.customer_id
END) as day_7_retention,
COUNT(DISTINCT CASE
WHEN DATEDIFF(o.order_date, ufo.first_order_date) = 30
THEN ufo.customer_id
END) as day_30_retention
FROM user_first_order ufo
LEFT JOIN orders o ON ufo.customer_id = o.customer_id
GROUP BY ufo.first_order_date
)
SELECT
first_order_date,
total_users,
ROUND(day_7_retention * 100.0 / total_users, 2) as day_7_retention_rate,
ROUND(day_30_retention * 100.0 / total_users, 2) as day_30_retention_rate
FROM retention
ORDER BY first_order_date;
-- 2. RFM 用户分群
WITH rfm AS (
SELECT
customer_id,
DATEDIFF(CURRENT_DATE, MAX(order_date)) as recency,
COUNT(order_id) as frequency,
SUM(amount) as monetary
FROM orders
GROUP BY customer_id
),
rfm_scores AS (
SELECT
customer_id,
NTILE(4) OVER (ORDER BY recency DESC) as r_score,
NTILE(4) OVER (ORDER BY frequency) as f_score,
NTILE(4) OVER (ORDER BY monetary) as m_score
FROM rfm
)
SELECT
customer_id,
r_score,
f_score,
m_score,
r_score + f_score + m_score as rfm_total,
CASE
WHEN r_score >= 3 AND f_score >= 3 AND m_score >= 3 THEN '重要价值用户'
WHEN r_score >= 3 AND f_score >= 2 THEN '重要发展用户'
WHEN r_score >= 2 AND f_score >= 2 THEN '一般用户'
ELSE '低价值用户'
END as segment
FROM rfm_scores
ORDER BY rfm_total DESC;
销售分析
-- 1. 同期群分析(Cohort Analysis)
WITH cohort_data AS (
SELECT
c.customer_id,
DATE_TRUNC('month', c.registration_date)::date as cohort_date,
DATE_TRUNC('month', o.order_date)::date as activity_date,
EXTRACT(YEAR FROM AGE(o.order_date, c.registration_date)) * 12 +
EXTRACT(MONTH FROM AGE(o.order_date, c.registration_date)) as months_active
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
),
cohort_size AS (
SELECT
cohort_date,
COUNT(DISTINCT customer_id) as cohort_size
FROM cohort_data
GROUP BY cohort_date
),
retention_data AS (
SELECT
cd.cohort_date,
cd.months_active,
COUNT(DISTINCT cd.customer_id) as active_users
FROM cohort_data cd
GROUP BY cd.cohort_date, cd.months_active
)
SELECT
rd.cohort_date,
cs.cohort_size,
rd.months_active,
rd.active_users,
ROUND(rd.active_users * 100.0 / cs.cohort_size, 2) as retention_rate
FROM retention_data rd
JOIN cohort_size cs ON rd.cohort_date = cs.cohort_date
ORDER BY rd.cohort_date, rd.months_active;
-- 2. 销售漏斗分析
WITH funnel AS (
SELECT
'Page View' as stage,
COUNT(DISTINCT session_id) as users,
100.0 as percentage
FROM page_views
UNION ALL
SELECT
'Add to Cart' as stage,
COUNT(DISTINCT session_id) as users,
COUNT(DISTINCT session_id) * 100.0 / (SELECT COUNT(DISTINCT session_id) FROM page_views) as percentage
FROM cart_additions
UNION ALL
SELECT
'Checkout' as stage,
COUNT(DISTINCT session_id) as users,
COUNT(DISTINCT session_id) * 100.0 / (SELECT COUNT(DISTINCT session_id) FROM page_views) as percentage
FROM checkouts
UNION ALL
SELECT
'Purchase' as stage,
COUNT(DISTINCT session_id) as users,
COUNT(DISTINCT session_id) * 100.0 / (SELECT COUNT(DISTINCT session_id) FROM page_views) as percentage
FROM purchases
)
SELECT * FROM funnel
ORDER BY
CASE stage
WHEN 'Page View' THEN 1
WHEN 'Add to Cart' THEN 2
WHEN 'Checkout' THEN 3
WHEN 'Purchase' THEN 4
END;
🔍 常见问题
Q1: 如何处理 NULL 值?
-- 使用 COALESCE 返回第一个非 NULL 值
SELECT
customer_id,
COALESCE(phone, email, 'No Contact') as contact_info
FROM customers;
-- 使用 NULLIF 避免除零错误
SELECT
revenue,
cost,
NULLIF(revenue - cost, 0) as profit
FROM sales;
Q2: 如何优化大表查询?
A:
1. 添加合适的索引
2. 使用分区表
3. 只查询需要的列
4. 使用 LIMIT 限制结果数
5. 避免在 WHERE 子句中对字段使用函数
-- 坏例子
SELECT * FROM orders WHERE YEAR(order_date) = 2024;
-- 好例子
SELECT order_id, customer_id, amount
FROM orders
WHERE order_date >= '2024-01-01'
AND order_date < '2025-01-01';
🔗 更多资源
官方文档
练习平台
性能工具
最后更新: 2026-06-01