跳到主要内容

SQL 分析实战练习

问题

如何将窗口函数、CTE、聚合、子查询等技能综合运用到真实业务场景中?

答案

本文通过 8 个完整的业务场景,将前面章节学到的所有 SQL 分析技能串联起来。每个场景都包含:业务背景、数据表结构、分析思路和完整 SQL。


场景 1:用户留存分析

业务背景

留存率是衡量产品粘性的核心指标。计算注册后第 1 天、第 3 天、第 7 天、第 30 天的留存率。

数据表

-- 用户注册表
CREATE TABLE user_register (
user_id INT PRIMARY KEY,
register_date DATE
);

-- 用户活跃表(每天每个活跃用户一条记录)
CREATE TABLE user_active (
user_id INT,
active_date DATE,
PRIMARY KEY (user_id, active_date)
);

完整 SQL

-- 计算各注册日期的用户在第 1/3/7/30 天的留存率
WITH register_cohort AS (
-- 每个注册日期的注册人数
SELECT register_date, COUNT(*) AS register_count
FROM user_register
GROUP BY register_date
),
retention_data AS (
-- 将注册信息和活跃信息关联,计算活跃天距注册天的差值
SELECT
r.register_date,
DATEDIFF(a.active_date, r.register_date) AS day_diff,
COUNT(DISTINCT a.user_id) AS active_count
FROM user_register r
JOIN user_active a ON r.user_id = a.user_id
WHERE DATEDIFF(a.active_date, r.register_date) IN (1, 3, 7, 30)
GROUP BY r.register_date, DATEDIFF(a.active_date, r.register_date)
)
SELECT
rc.register_date,
rc.register_count,
-- 用条件聚合提取各天的留存人数
MAX(CASE WHEN rd.day_diff = 1 THEN rd.active_count END) AS day1_active,
ROUND(MAX(CASE WHEN rd.day_diff = 1 THEN rd.active_count END)
/ rc.register_count * 100, 2) AS day1_retention_pct,
MAX(CASE WHEN rd.day_diff = 3 THEN rd.active_count END) AS day3_active,
ROUND(MAX(CASE WHEN rd.day_diff = 3 THEN rd.active_count END)
/ rc.register_count * 100, 2) AS day3_retention_pct,
MAX(CASE WHEN rd.day_diff = 7 THEN rd.active_count END) AS day7_active,
ROUND(MAX(CASE WHEN rd.day_diff = 7 THEN rd.active_count END)
/ rc.register_count * 100, 2) AS day7_retention_pct,
MAX(CASE WHEN rd.day_diff = 30 THEN rd.active_count END) AS day30_active,
ROUND(MAX(CASE WHEN rd.day_diff = 30 THEN rd.active_count END)
/ rc.register_count * 100, 2) AS day30_retention_pct
FROM register_cohort rc
LEFT JOIN retention_data rd ON rc.register_date = rd.register_date
GROUP BY rc.register_date, rc.register_count
ORDER BY rc.register_date;
涉及知识点
  • CTE:分步组织中间结果
  • 条件聚合CASE WHEN + MAX 行转列
  • LEFT JOIN:保留所有注册日期(即使缺少某天的留存数据)
  • DATEDIFF:计算日期差值

场景 2:漏斗转化分析

业务背景

电商的核心转化漏斗:浏览商品 → 加入购物车 → 提交订单 → 完成支付。计算每一步的转化率和流失率。

数据表

CREATE TABLE user_events (
user_id INT,
event_type VARCHAR(20), -- 'view', 'add_cart', 'submit_order', 'payment'
event_time DATETIME,
session_id VARCHAR(50)
);

完整 SQL

WITH funnel AS (
SELECT
-- 每个步骤的独立用户数
COUNT(DISTINCT CASE WHEN event_type = 'view' THEN user_id END) AS step1_view,
COUNT(DISTINCT CASE WHEN event_type = 'add_cart' THEN user_id END) AS step2_cart,
COUNT(DISTINCT CASE WHEN event_type = 'submit_order' THEN user_id END) AS step3_order,
COUNT(DISTINCT CASE WHEN event_type = 'payment' THEN user_id END) AS step4_pay
FROM user_events
WHERE event_time BETWEEN '2024-01-01' AND '2024-01-31'
)
SELECT
step1_view AS '浏览人数',
step2_cart AS '加购人数',
ROUND(step2_cart / NULLIF(step1_view, 0) * 100, 2) AS '浏览→加购转化率%',
step3_order AS '下单人数',
ROUND(step3_order / NULLIF(step2_cart, 0) * 100, 2) AS '加购→下单转化率%',
step4_pay AS '支付人数',
ROUND(step4_pay / NULLIF(step3_order, 0) * 100, 2) AS '下单→支付转化率%',
ROUND(step4_pay / NULLIF(step1_view, 0) * 100, 2) AS '整体转化率%'
FROM funnel;
进阶:带时间顺序的严格漏斗

上面的简单漏斗只统计"做过某个事件"的人数,不要求事件有先后顺序。严格漏斗要求事件必须按顺序发生:

-- 严格漏斗:步骤之间有时间先后关系
WITH ordered_events AS (
SELECT user_id, event_type, event_time,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY event_time) AS rn
FROM user_events
),
step1 AS (
SELECT DISTINCT user_id, MIN(event_time) AS t1
FROM ordered_events WHERE event_type = 'view'
GROUP BY user_id
),
step2 AS (
SELECT s1.user_id, MIN(oe.event_time) AS t2
FROM step1 s1
JOIN ordered_events oe ON s1.user_id = oe.user_id
AND oe.event_type = 'add_cart' AND oe.event_time > s1.t1
GROUP BY s1.user_id
),
step3 AS (
SELECT s2.user_id, MIN(oe.event_time) AS t3
FROM step2 s2
JOIN ordered_events oe ON s2.user_id = oe.user_id
AND oe.event_type = 'submit_order' AND oe.event_time > s2.t2
GROUP BY s2.user_id
),
step4 AS (
SELECT s3.user_id
FROM step3 s3
JOIN ordered_events oe ON s3.user_id = oe.user_id
AND oe.event_type = 'payment' AND oe.event_time > s3.t3
GROUP BY s3.user_id
)
SELECT
(SELECT COUNT(*) FROM step1) AS step1_view,
(SELECT COUNT(*) FROM step2) AS step2_cart,
(SELECT COUNT(*) FROM step3) AS step3_order,
(SELECT COUNT(*) FROM step4) AS step4_pay;

场景 3:连续登录天数

业务背景

找出连续登录 7 天及以上的用户,用于发放"连续签到"奖励。

完整 SQL

-- 经典技巧:日期减去行号,连续日期的差值相同
WITH login_ranked AS (
SELECT
user_id,
active_date,
-- 按用户分组,按日期排序编号
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY active_date) AS rn
FROM user_active
),
grouped AS (
SELECT
user_id,
active_date,
-- 核心:日期 - 行号 = 分组标识
-- 连续的日期减去连续的行号,得到的值相同
DATE_SUB(active_date, INTERVAL rn DAY) AS group_date
FROM login_ranked
)
SELECT
user_id,
MIN(active_date) AS start_date, -- 连续登录的开始日期
MAX(active_date) AS end_date, -- 连续登录的结束日期
COUNT(*) AS consecutive_days -- 连续登录天数
FROM grouped
GROUP BY user_id, group_date
HAVING COUNT(*) >= 7
ORDER BY consecutive_days DESC;
差值分组法原理

假设用户 A 在 1月1日~1月5日 连续登录:

active_daternactive_date - rn
01-01112-31
01-02212-31
01-03312-31
01-04412-31
01-05512-31

连续日期减去连续行号 = 相同的结果,因此可以用 GROUP BY group_date 把连续的天分到同一组。


场景 4:RFM 用户分层

业务背景

RFM 模型是经典的用户分层方法:

  • R (Recency):最近一次消费距今天数(越小越好)
  • F (Frequency):消费频次(越高越好)
  • M (Monetary):消费总金额(越高越好)

将每个维度分为高/低两档,得到 8 种用户类型。

完整 SQL

WITH rfm_raw AS (
-- 第一步:计算 R、F、M 原始值
SELECT
user_id,
DATEDIFF(CURDATE(), MAX(order_date)) AS recency,
COUNT(*) AS frequency,
SUM(amount) AS monetary
FROM orders
WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
GROUP BY user_id
),
rfm_scored AS (
-- 第二步:用中位数作为分界点(分为高/低)
SELECT
user_id,
recency,
frequency,
monetary,
CASE WHEN recency <= (SELECT AVG(recency) FROM rfm_raw) THEN 1 ELSE 0 END AS r_score,
CASE WHEN frequency >= (SELECT AVG(frequency) FROM rfm_raw) THEN 1 ELSE 0 END AS f_score,
CASE WHEN monetary >= (SELECT AVG(monetary) FROM rfm_raw) THEN 1 ELSE 0 END AS m_score
FROM rfm_raw
)
SELECT
user_id,
recency,
frequency,
monetary,
r_score,
f_score,
m_score,
-- 第三步:根据 R/F/M 得分分类
CASE
WHEN r_score = 1 AND f_score = 1 AND m_score = 1 THEN '重要价值客户'
WHEN r_score = 1 AND f_score = 0 AND m_score = 1 THEN '重要发展客户'
WHEN r_score = 0 AND f_score = 1 AND m_score = 1 THEN '重要保持客户'
WHEN r_score = 0 AND f_score = 0 AND m_score = 1 THEN '重要挽留客户'
WHEN r_score = 1 AND f_score = 1 AND m_score = 0 THEN '一般价值客户'
WHEN r_score = 1 AND f_score = 0 AND m_score = 0 THEN '一般发展客户'
WHEN r_score = 0 AND f_score = 1 AND m_score = 0 THEN '一般保持客户'
WHEN r_score = 0 AND f_score = 0 AND m_score = 0 THEN '一般挽留客户'
END AS user_segment
FROM rfm_scored
ORDER BY monetary DESC;

场景 5:滑动平均与趋势分析

业务背景

日活跃用户数(DAU)波动较大,使用 7 天滑动平均来平滑趋势,并标记增长/下降趋势。

完整 SQL

WITH daily_stats AS (
SELECT
active_date,
COUNT(DISTINCT user_id) AS dau
FROM user_active
WHERE active_date BETWEEN '2024-01-01' AND '2024-03-31'
GROUP BY active_date
),
with_moving_avg AS (
SELECT
active_date,
dau,
-- 7 天滑动平均(包括当天和前 6 天)
ROUND(AVG(dau) OVER (
ORDER BY active_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
), 0) AS ma7_dau,
-- 上周同日的 DAU
LAG(dau, 7) OVER (ORDER BY active_date) AS last_week_dau
FROM daily_stats
)
SELECT
active_date,
dau,
ma7_dau,
last_week_dau,
-- 周同比增长率
ROUND((dau - last_week_dau) / NULLIF(last_week_dau, 0) * 100, 2) AS wow_growth_pct,
-- 趋势判断
CASE
WHEN ma7_dau > LAG(ma7_dau, 1) OVER (ORDER BY active_date) THEN '📈 上升'
WHEN ma7_dau < LAG(ma7_dau, 1) OVER (ORDER BY active_date) THEN '📉 下降'
ELSE '➡️ 持平'
END AS trend
FROM with_moving_avg
ORDER BY active_date;

场景 6:同期群分析(Cohort Analysis)

业务背景

按用户的首单月份分组(同期群),分析每个群体在后续各月的复购情况。

完整 SQL

WITH first_order AS (
-- 第一步:找到每个用户的首单月份
SELECT
user_id,
DATE_FORMAT(MIN(order_date), '%Y-%m') AS cohort_month
FROM orders
GROUP BY user_id
),
monthly_orders AS (
-- 第二步:每个用户每月的订单情况
SELECT
o.user_id,
f.cohort_month,
DATE_FORMAT(o.order_date, '%Y-%m') AS order_month,
-- 计算距首单的月数
PERIOD_DIFF(
DATE_FORMAT(o.order_date, '%Y%m'),
DATE_FORMAT(STR_TO_DATE(CONCAT(f.cohort_month, '-01'), '%Y-%m-%d'), '%Y%m')
) AS month_offset
FROM orders o
JOIN first_order f ON o.user_id = f.user_id
)
SELECT
cohort_month AS '首单月份',
COUNT(DISTINCT CASE WHEN month_offset = 0 THEN user_id END) AS 'M0 (新客)',
COUNT(DISTINCT CASE WHEN month_offset = 1 THEN user_id END) AS 'M1',
COUNT(DISTINCT CASE WHEN month_offset = 2 THEN user_id END) AS 'M2',
COUNT(DISTINCT CASE WHEN month_offset = 3 THEN user_id END) AS 'M3',
-- M1 复购率
ROUND(COUNT(DISTINCT CASE WHEN month_offset = 1 THEN user_id END)
/ NULLIF(COUNT(DISTINCT CASE WHEN month_offset = 0 THEN user_id END), 0) * 100, 2) AS 'M1复购率%',
-- M2 复购率
ROUND(COUNT(DISTINCT CASE WHEN month_offset = 2 THEN user_id END)
/ NULLIF(COUNT(DISTINCT CASE WHEN month_offset = 0 THEN user_id END), 0) * 100, 2) AS 'M2复购率%',
-- M3 复购率
ROUND(COUNT(DISTINCT CASE WHEN month_offset = 3 THEN user_id END)
/ NULLIF(COUNT(DISTINCT CASE WHEN month_offset = 0 THEN user_id END), 0) * 100, 2) AS 'M3复购率%'
FROM monthly_orders
GROUP BY cohort_month
ORDER BY cohort_month;
同期群分析的价值
  1. 对比不同批次用户:哪个月获取的用户质量更高?
  2. 衡量产品改进效果:产品改版后新用户的留存率是否提升?
  3. 预测 LTV:根据历史各月复购率曲线,预估客户终身价值

场景 7:排行榜与 TopN

业务背景

计算每个商品类目下的销售 Top 3,以及每个类目的销售占比。

完整 SQL

WITH category_product_sales AS (
-- 每个类目每个商品的总销售额
SELECT
p.category,
p.product_name,
SUM(o.amount) AS total_sales
FROM orders o
JOIN products p ON o.product_id = p.id
WHERE o.order_date BETWEEN '2024-01-01' AND '2024-12-31'
GROUP BY p.category, p.product_name
),
ranked AS (
-- 在每个类目内排名
SELECT
category,
product_name,
total_sales,
ROW_NUMBER() OVER (PARTITION BY category ORDER BY total_sales DESC) AS rk,
-- 类目总销售额
SUM(total_sales) OVER (PARTITION BY category) AS category_total,
-- 全局总销售额
SUM(total_sales) OVER () AS global_total
FROM category_product_sales
)
SELECT
category AS '类目',
product_name AS '商品',
total_sales AS '销售额',
rk AS '排名',
ROUND(total_sales / category_total * 100, 2) AS '类目内占比%',
ROUND(category_total / global_total * 100, 2) AS '类目全局占比%'
FROM ranked
WHERE rk <= 3
ORDER BY category, rk;

场景 8:异常数据检测

业务背景

检测订单数据中的异常:金额异常(超过均值 3 个标准差)、重复订单、时间异常等。

完整 SQL

-- 1. 金额异常检测:Z-Score 方法
WITH stats AS (
SELECT AVG(amount) AS avg_amount, STDDEV(amount) AS std_amount FROM orders
)
SELECT o.*,
ROUND((o.amount - s.avg_amount) / NULLIF(s.std_amount, 0), 2) AS z_score
FROM orders o, stats s
WHERE ABS((o.amount - s.avg_amount) / NULLIF(s.std_amount, 0)) > 3
ORDER BY z_score DESC;

-- 2. 重复订单检测:同一用户短时间内的相同金额订单
SELECT
a.id AS order_id_1,
b.id AS order_id_2,
a.user_id,
a.amount,
a.order_date AS time_1,
b.order_date AS time_2,
TIMESTAMPDIFF(MINUTE, a.order_date, b.order_date) AS diff_minutes
FROM orders a
JOIN orders b ON a.user_id = b.user_id
AND a.amount = b.amount
AND a.id < b.id
AND TIMESTAMPDIFF(MINUTE, a.order_date, b.order_date) < 5;

-- 3. 环比异常:日销售额突然下降/激增超过 50%
WITH daily_sales AS (
SELECT
DATE(order_date) AS sale_date,
SUM(amount) AS daily_amount
FROM orders
GROUP BY DATE(order_date)
),
with_change AS (
SELECT
sale_date,
daily_amount,
LAG(daily_amount) OVER (ORDER BY sale_date) AS prev_amount,
ROUND((daily_amount - LAG(daily_amount) OVER (ORDER BY sale_date))
/ NULLIF(LAG(daily_amount) OVER (ORDER BY sale_date), 0) * 100, 2) AS change_pct
FROM daily_sales
)
SELECT * FROM with_change
WHERE ABS(change_pct) > 50
ORDER BY sale_date;

知识点综合索引

场景核心知识点难度
用户留存分析CTE + DATEDIFF + 条件聚合 + LEFT JOIN⭐⭐⭐
漏斗转化分析条件聚合 + NULLIF + CTE 链式⭐⭐ ~ ⭐⭐⭐⭐
连续登录天数窗口函数 ROW_NUMBER + 差值分组法⭐⭐⭐⭐
RFM 用户分层CTE + 子查询 + CASE WHEN⭐⭐⭐
滑动平均窗口函数 AVG + ROWS BETWEEN + LAG⭐⭐⭐
同期群分析CTE + PERIOD_DIFF + 条件聚合⭐⭐⭐⭐
TopN 排行榜ROW_NUMBER + PARTITION BY + 占比⭐⭐⭐
异常数据检测自连接 + Z-Score + LAG 环比⭐⭐⭐

相关链接