@
brader 我试了下,大致有以下结论:
1. 即使我用 text 类型存 "1,2,3,…"( latin1 编码),无论是 700 / 7000 字节,find_in_set 都是 index
看来 MySQL 还不足够聪明优化 find_in_set
我还以为会生成个临时表,然后 FROM b JOIN a ON
a.id IN tmp_table_of_a_ids 呢。。
2. JSON_TABLE 根据 "1,2,3,…" 生成表,再 join 表 a 是 eq_ref ,效率看来不错
3. 表 b 即使插入两行有 7000 字节的 ids 的行记录,也都在同一页( 16KB ),不用担心查找溢出页导致的效率问题
4. 你说的方法二(新增 C 表),我试了下,插入一千万行(a_id, b_id),磁盘占用 272.8 MB ,平均每行占用约 29 字节
看了下书,主要是每行数据额外记录了(记录头信息 5 字节 + 事务 ID 列 6 字节+ 回滚指针列 7 字节)= 20 字节,然后才是 (int, int) 的 8 字节,所以 C 表其实空间利用率很低
若是用 "1,2,3,…" 存储,即使每个 a_id 是 8 个数字+1 个逗号,一千万个记录也才 9 * (10 ^ 7) / (1 << 20) ≈ 85.8 MB
可即使是 85.8 MB ,楼主也说“计算量小,但是耗地方”
所以 C 表更不符合楼主要求
SET SESSION group_concat_max_len = 8192;
SET SESSION cte_max_recursion_depth = 2048;
CREATE TABLE `a` (
`id` int NOT NULL AUTO_INCREMENT,
`time` date NOT NULL,
`data` int NOT NULL,
`user` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
PRIMARY KEY (`id`)
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
CREATE TABLE `b` (
`id` int NOT NULL AUTO_INCREMENT,
`user` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`sum_data` int NOT NULL,
`a_ids` text CHARACTER SET latin1 COLLATE latin1_swedish_ci NOT NULL,
PRIMARY KEY (`id`)
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
INSERT INTO a (time, data, user)
WITH RECURSIVE
generate_series(i) AS (
SELECT 0
UNION ALL
SELECT i+1 FROM generate_series WHERE i < 1600
)
SELECT DATE_ADD('2021-01-01', INTERVAL i DAY), i + 1, 'a'
FROM generate_series
WHERE i < 199 -- ids 长度为 2*9(1~9,) + 3*90(10~99,) + 4*100(100~199,) -1(末尾逗号) = 687 B
UNION ALL
SELECT DATE_ADD('2021-01-01', INTERVAL i DAY), i + 1, 'b'
FROM generate_series
WHERE i <= 1760 - 200 -- ids 长度为 4*800(200~999,) + 5*761(1000~1760,) -1 = 7004 B
UNION ALL
SELECT DATE_ADD('2021-01-01', INTERVAL i DAY), i + 1, 'c'
FROM generate_series
WHERE i <= 3240 - 1761; -- ids 长度为 5*1480(1761~3240,) -1 = 7399 B
INSERT INTO b (user, sum_data, a_ids)
SELECT user, sum(data), GROUP_CONCAT(id)
FROM a
GROUP BY user;
EXPLAIN
SELECT
a.id FROM b
JOIN a ON FIND_IN_SET(
a.id, b.a_ids)
WHERE b.user = 'c';
EXPLAIN
SELECT a.*
FROM b,
JSON_TABLE(
CONCAT('[', b.a_ids, ']'),
'$[*]' COLUMNS (id INT PATH '$')
) AS ids
JOIN a USING(id)
WHERE b.user = 'c';
C 表测试:
SET SESSION cte_max_recursion_depth = 1 << 31;
CREATE TABLE `c` (
`a_id` int NOT NULL,
`b_id` int NOT NULL,
PRIMARY KEY (`a_id`, `b_id`)
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
INSERT INTO c
WITH RECURSIVE
generate_series(i) AS (
SELECT 1
UNION ALL
SELECT i+1 FROM generate_series WHERE i <= 10000000
)
SELECT i, i
FROM generate_series;