关于小米的拆机评测,顺手写了个 B 站弹幕爬虫范例。

2020-01-17 16:39:28 +08:00
 JCZ2MkKb5S8ZX9pq

结构

用法

文件

get_danmaku.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Load bilibili history danmaku, and return json.

import requests
import logging as log
import json
import time
from lxml import etree
from .headers import headers


def get_history_danmaku(oid, date):
    """Fetch the history danmaku of one video part for a single day.

    Sends a GET request to the bilibili history-danmaku endpoint with
    ``oid`` and ``date`` as query parameters, then turns every ``<d>``
    element found in the response into a plain dict.

    Args:
        oid: cid/oid of the video part. It is sent to the API as-is and
            must be convertible with ``int()``.
        date: Day to query, sent to the API as-is (the demo in this file
            passes a 'YYYY-MM-DD' string). It is also copied into every
            returned record.

    Returns:
        A list with one dict per danmaku, holding the keys ``cid``,
        ``time``, ``position``, ``fontsize``, ``color``, ``ctime``,
        ``unknown``, ``author``, ``dmid``, ``content``, ``date`` and
        ``updateTime``. The list is empty when the response body is empty
        or contains no ``<d>`` elements.

    Raises:
        IndexError: if a ``<d>`` element has no ``p`` attribute or the
            attribute has fewer than eight comma-separated fields.
        ValueError: if a numeric field of ``p`` cannot be converted.
    """
    dm_list = []

    # get data
    url = 'https://api.bilibili.com/x/v2/dm/history'
    params = {'type': 1, 'oid': oid, 'date': date}
    r = requests.get(url, params=params, headers=headers)
    content = r.content
    # Decode leniently: a malformed byte in the body must not abort the
    # fetch just because of a debug log line.
    log.debug(content.decode('utf-8', errors='replace'))

    # An empty body holds no danmaku; do not hand it to the parser.
    if not content.strip():
        return dm_list

    # read xml
    xml = etree.HTML(content)
    if xml is None:
        return dm_list

    for node in xml.xpath('//d'):
        attrs = node.xpath('./@p')[0].split(',')
        # A danmaku with empty content has no text node at all, so
        # indexing the xpath result directly would raise IndexError.
        texts = node.xpath('./text()')
        text = texts[0] if texts else ''
        log.debug(f'{attrs}, {text}')

        # format data
        dm_list.append({
            'cid': int(oid),
            'time': int(float(attrs[0])),  # send time (playback position in the video)
            'position': int(attrs[1]),  # danmaku position
            'fontsize': int(attrs[2]),  # font size
            # colour as six lowercase hex digits (low 24 bits, zero padded)
            'color': f'{int(attrs[3]) & 0xFFFFFF:06x}',
            'ctime': int(attrs[4]),  # danmaku creation time
            'unknown': attrs[5],  # meaning not known to the author
            'author': attrs[6],  # sender id (not the same as uid)
            'dmid': int(attrs[7]),  # danmaku id
            'content': text,  # danmaku text
            'date': date,
            'updateTime': int(time.time())
        })

    return dm_list


if __name__ == '__main__':
    # Demo: fetch one day of danmaku for a sample cid with debug logging
    # enabled and pretty-print the records as JSON.
    log.basicConfig(level=log.DEBUG)

    sample = get_history_danmaku(136870419, '2019-12-20')
    print(json.dumps(sample, ensure_ascii=False, indent=4))

get_all_history_danmaku.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import logging as log
import os
import re
import json
from datetime import datetime, timedelta
from .get_danmaku import get_history_danmaku
from .headers import headers


def get_all_history_danmaku(aid):
    """Collect every history danmaku of a video and dump them to JSON.

    Downloads the video page for ``aid``, extracts the cid of every part
    and the publish date from the page source, then calls
    ``get_history_danmaku`` for each cid and each day from the publish
    date up to the current local time. Records are de-duplicated by
    ``dmid`` (the first occurrence wins) and written to
    ``av{aid}_dm.json`` in the directory of this file.

    Args:
        aid: av number of the video; interpolated into the page URL and
            the output file name.

    Returns:
        None. The result is only written to disk.

    Raises:
        IndexError: if the page source contains no ``"pages"`` array or
            no publish-date match.
    """
    url = f'https://www.bilibili.com/video/av{aid}'
    # Send the same shared headers as the danmaku request does, so both
    # requests of this tool identify themselves consistently.
    body = requests.get(url, headers=headers).text
    log.debug(body)

    # get cid / oid
    pages = re.findall(r'(?<="pages":)\[.*?\]', body)[0]
    # Require at least one digit so a non-numeric cid value cannot yield
    # an empty string that would later be sent to the API.
    cids = re.findall(r'(?<="cid":)\d+', pages)
    log.info(f'{cids=}')

    # get post date
    publish = re.findall(r'(?<=Published" content=")\d{4}-\d{2}-\d{2}', body)[0]
    current_day = datetime.strptime(publish, '%Y-%m-%d')
    log.info(f'{publish=}')

    result = {}
    while True:
        date = current_day.strftime('%Y-%m-%d')
        log.info(f'get danmaku of {date}')
        for cid in cids:
            for dm in get_history_danmaku(cid, date):
                # setdefault keeps the first record seen for a dmid, so a
                # danmaku returned for several days is stored only once
                result.setdefault(dm['dmid'], dm)

        # go next day or exit
        current_day += timedelta(1)
        if current_day > datetime.now():
            break

    here = os.path.abspath(os.path.dirname(__file__))
    output = os.path.join(here, f'av{aid}_dm.json')
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    # Demo: crawl the full danmaku history of a sample video with
    # INFO-level progress logging.
    log.basicConfig(level=log.INFO)
    get_all_history_danmaku(79974337)


3452 次点击
所在节点    Python
5 条回复
NSAgold
2020-01-17 19:48:16 +08:00
没记错是弹幕池,区分是否是高级弹幕用的
JCZ2MkKb5S8ZX9pq
2020-01-17 19:49:44 +08:00
@NSAgold 原来如此,我等下找个视频验证一下,非常感谢。
这个字段在我这儿躺了至少两年了……
JCZ2MkKb5S8ZX9pq
2020-01-17 19:53:50 +08:00
@NSAgold
试了下这个视频
https://www.bilibili.com/video/av61919487

好像并不是,高级弹幕的这个字段仍旧是 0,但内容部分是数组的形式。
Va1n3R
2020-01-19 01:47:09 +08:00
弹幕 aid 被 hash 过,能彩虹表枚举出来的。
JCZ2MkKb5S8ZX9pq
2020-01-19 13:04:06 +08:00
@Va1n3R 那可以拿评论最多的前十和弹幕最多的前十撞一下试试,请问用的是哪种 hash ?

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/638795

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX