headers.py:内含一个名为 headers 的字典;因为涉及隐私,不在此共享,可以把自己的请求头贴进去。
get_danmaku.py:获取指定 oid(cid)和 date 的弹幕。
get_all_history_danmaku.py:获取所有历史弹幕,并会生成一个 json 文件。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Load bilibili history danmaku, and return json.
import requests
import logging as log
import json
import time
from lxml import etree
from .headers import headers
def _build_danmaku_record(oid, date, attrs, text):
    """Turn the split ``p`` fields and text of one ``<d>`` element into a dict."""
    return {
        'cid': int(oid),
        'time': int(float(attrs[0])),  # playback position at which it was sent
        'position': int(attrs[1]),  # danmaku position
        'fontsize': int(attrs[2]),  # font size
        # Colour as six lowercase hex digits: the low 24 bits of the decimal
        # value, zero-padded on the left.
        'color': f'{int(attrs[3]) & 0xFFFFFF:06x}',
        'ctime': int(attrs[4]),  # danmaku creation time
        'unknown': attrs[5],
        'author': attrs[6],  # sender identifier (not the same as uid)
        'dmid': int(attrs[7]),  # danmaku id
        'content': text,  # danmaku text
        'date': date,
        'updateTime': int(time.time()),
    }


def get_history_danmaku(oid, date, timeout=10):
    """Fetch the history danmaku of one oid (cid) for one date.

    Args:
        oid: The oid (cid) to query; must be something ``int()`` accepts.
        date: The date to query, passed to the API unchanged (the sample
            call in this file uses ``'YYYY-MM-DD'``).
        timeout: Seconds to wait for the HTTP response. Without it a stalled
            connection would block forever.

    Returns:
        A list with one dict per ``<d>`` element of the response whose ``p``
        attribute carries at least eight comma-separated fields. The list is
        empty when the response body is empty or cannot be parsed.

    Raises:
        ValueError: If a numeric ``p`` field is not a valid number.
        requests.RequestException: On network failures or timeout.
    """
    url = 'https://api.bilibili.com/x/v2/dm/history'
    params = {'type': 1, 'oid': oid, 'date': date}
    r = requests.get(url, params=params, headers=headers, timeout=timeout)
    content = r.content
    # errors='replace' keeps a debug-only log line from aborting the fetch.
    log.debug(content.decode('utf-8', errors='replace'))

    # Nothing to parse; also avoids handing an empty document to lxml.
    if not content.strip():
        return []
    xml = etree.HTML(content)
    if xml is None:
        return []

    dm_list = []
    for node in xml.xpath('//d'):
        attrs = (node.get('p') or '').split(',')
        if len(attrs) < 8:
            # A missing or truncated p attribute cannot be mapped to a record.
            log.warning('skip danmaku with malformed p attribute: %r', attrs)
            continue
        # A danmaku with empty text has no text node at all.
        texts = node.xpath('./text()')
        text = texts[0] if texts else ''
        log.debug(f'{attrs}, {text}')
        dm_list.append(_build_danmaku_record(oid, date, attrs, text))
    return dm_list
if __name__ == '__main__':
    # Demo run: fetch one day's danmaku for a sample oid and pretty-print
    # the resulting list as JSON.
    log.basicConfig(level=log.DEBUG)
    sample_oid, sample_date = 136870419, '2019-12-20'
    danmaku = get_history_danmaku(sample_oid, sample_date)
    print(json.dumps(danmaku, ensure_ascii=False, indent=4))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import logging as log
import os
import re
import json
from datetime import datetime, timedelta
from .get_danmaku import get_history_danmaku
from .headers import headers
def get_all_history_danmaku(aid, timeout=10):
    """Collect the history danmaku of a video for every day since publication.

    The video page is scraped for the cids of its parts and for its publish
    date. Then ``get_history_danmaku`` is called for every cid and every day
    from the publish date onwards, until the next day to fetch lies after the
    current time. Records are de-duplicated by ``dmid`` and written to
    ``av{aid}_dm.json`` in the directory of this module.

    Args:
        aid: The av number of the video.
        timeout: Seconds to wait for the video page request. It is not
            forwarded to ``get_history_danmaku``.

    Raises:
        IndexError: If the page contains no ``"pages"`` array or no publish
            date (same exception type as before, now with a message).
        requests.RequestException: On network failures or timeout.
    """
    url = f'https://www.bilibili.com/video/av{aid}'
    # NOTE(review): the file-level `headers` import is not used for this
    # request -- confirm whether the page fetch should send it.
    body = requests.get(url, timeout=timeout).text
    log.debug(body)

    # get cid / oid
    # NOTE: the non-greedy match stops at the first ']' after '"pages":'.
    pages_match = re.search(r'(?<="pages":)\[.*?\]', body)
    if pages_match is None:
        raise IndexError(f'no "pages" data found in {url}')
    # \d+ rather than \d*: an empty capture would crash int() later on.
    cids = re.findall(r'(?<="cid":)\d+', pages_match.group(0))
    log.info(f'{cids=}')
    if not cids:
        log.warning('no cid found for av%s, the output will be empty', aid)

    # get post date
    publish_match = re.search(
        r'(?<=Published" content=")\d{4}-\d{2}-\d{2}', body)
    if publish_match is None:
        raise IndexError(f'no publish date found in {url}')
    publish = publish_match.group(0)
    day = datetime.strptime(publish, '%Y-%m-%d')
    log.info(f'{publish=}')

    result = {}
    while True:
        date = day.strftime('%Y-%m-%d')
        log.info(f'get danmaku of {date}')
        for cid in cids:
            for dm in get_history_danmaku(cid, date):
                # setdefault keeps the first record seen for each dmid, so a
                # danmaku returned on several days is stored only once.
                result.setdefault(dm['dmid'], dm)
        # go to the next day, or stop once it lies after the current time
        day += timedelta(days=1)
        if day > datetime.now():
            break

    here = os.path.abspath(os.path.dirname(__file__))
    output = os.path.join(here, f'av{aid}_dm.json')
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
if __name__ == '__main__':
    # Script entry point: crawl all history danmaku of one sample video.
    log.basicConfig(level=log.INFO)
    get_all_history_danmaku(aid=79974337)
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.