@
ComplexPug #14 我用 Lua 写了个(可限制内存)词频统计,并和 DuckDB 比了下速度,感觉还行。
1. 测试数据
[知乎回答](
https://www.zhihu.com/answer/1906560411 ) 里,分享的《英文维基百科(仅文本版)》,并简单用脚本预处理了下(去标点、每词一行、转小写)。
解压后:13.33 GB
处理后:12.84 GB ,23 亿词(不重复的有 854W )
2. 测试结果
- DuckDB (不限制内存):4 分半( 1.0X )
- Lua(最多存100M文本):7 分半( 1.6X ),01 个共 103MB 临时文件
- Lua(最多存 10M文本):8 分钟( 1.7X ),23 个共 306MB 临时文件
3. 测试硬件:i5-8250U 低压 CPU 轻薄本。。
4. 运行截图
5. 预处理脚本
```shell
unzip -p enwiki_text.zip | tr -cs "[:alnum:]'-" '\n' | tr [:upper:] [:lower:]
```
6. Lua 脚本( V 站吞空格,所以将行首缩进,都转为全角空格了)
```lua
#!luajit
local TOP_NUM = 1000
local MAX_SIZE = tonumber(arg[1] or 16) * 2 ^ 20
local dict = {}
local dict_size = 0
local files = {}
local words = {}
local word = ''
local freq = 0
function heap_swap(heap, comp, lhs, rhs, func, next)
if comp(heap[lhs], heap[rhs]) then
heap[lhs], heap[rhs] = heap[rhs], heap[lhs]
func(heap, comp, next)
end
end
function heap_up(heap, comp, idx)
if idx > 1 then
local pa = math.floor(idx / 2)
heap_swap(heap, comp, idx, pa, heap_up, pa)
end
end
function heap_down(heap, comp, idx)
if idx <= #heap / 2 then
local left, right = idx * 2, idx * 2 + 1
local son = right > #heap and left or
(comp(heap[left], heap[right]) and left or right)
heap_swap(heap, comp, son, idx, heap_down, son)
end
end
function heap_push(heap, comp, val)
table.insert(heap, val)
heap_up(heap, comp, #heap)
end
function heap_pop(heap, comp)
heap[1] = heap[#heap]
table.remove(heap)
heap_down(heap, comp, 1)
end
function heap_adjust(heap, comp, idx)
heap_down(heap, comp, idx)
heap_up(heap, comp, idx)
end
function comp(lhs, rhs)
return lhs[1] < rhs[1]
end
function sorted_keys(dict)
local keys = {}
for k in pairs(dict) do
table.insert(keys, k)
end
table.sort(keys)
return keys
end
function push_word()
if #word > 0 then
if #words < TOP_NUM then
heap_push(words, comp, {freq, word})
elseif freq > words[1][1] then
words[1] = {freq, word}
heap_adjust(words, comp, 1)
end
end
end
function try_save_dict(new_key)
dict_size = dict_size + (new_key and #new_key or 0)
if not new_key or dict_size > MAX_SIZE then
local file = io.tmpfile()
for idx, key in ipairs(sorted_keys(dict)) do
file:write(dict[key], ' ', key, '\n')
end
dict = {}
file:seek('set')
dict_size = new_key and #new_key
table.insert(files, {'', 0, file})
end
end
for word in io.lines() do
dict[word] = (dict[word] or try_save_dict(word) or 0) + 1
end
try_save_dict()
while #files > 0 do
local file, read_ok = files[1]
if word == file[1] then
freq = freq + file[2]
else
push_word()
word, freq = file[1], file[2]
end
file[2], read_ok, file[1] = file[3]:read('n', 1, 'l');
(read_ok and heap_adjust or heap_pop)(files, comp, 1)
end
push_word()
table.sort(words, comp)
for i = #words, 1, -1 do
io.write(string.format('%d %s\n', words[i][1], words[i][2]))
end
```