弹幕格式
首先需要找到弹幕文件,分析格式。
查找弹幕文件
先随便打开一个视频,打开控制台Network查看,复制一个弹幕搜索一下:
没匹配,可能弹幕文件有加密,或者有特殊编码。
清除一些无关的文件,再点击加载弹幕,只剩下几个seg.so和一个web文件。
查看这个seg.so
看来是一个二进制文件,直接预览会出现许多乱码,但仍然可以看见一部分弹幕内容,已经可以确定这个就是弹幕文件了。
分析
乱码的可能原因有两种,一是弹幕文件进行了加密,二是弹幕文件进行了某种编码,类似protobuf,并不是普通的文本文件。
由于我们可以看见部分内容,所以可以排除加密这种可能性,这个弹幕文件应该是进行了某种格式的编码。
源码跟踪
对这个弹幕文件的编码进行追踪,查看一下调用的栈:
找到几个可疑的调用,看一下:
loadDmPbAll
loadDmPb
fetchDmSeg
在fetchDmSeg中发现有一个dr(this.rootPlayer).r(null,...)的方法,像是请求一个url,并有重试次数,继续跟进:
dr.r
发现一个重要的点:e.pb2Json.toJson,明显是一个把bytes转成json的方法,跟进。
e.pb2Json.toJson
静态找不到代码,下断点进入:
有一个r.decode!断点进去
还是decode,继续进去,重点来了:
可以确定,这就是弹幕解码的代码,可以参照它开始实现弹幕解码了。
解析实现
基于python对弹幕文件进行解析。
decode
// JS source
// Decodes a danmaku segment file.
//
// Reads tagged fields from reader `r` until position `c`, which is the end of
// the reader when `l` is undefined and `r.pos + l` otherwise. Every field with
// number 1 is decoded by `types[0].decode` (given a uint32 length read from `r`)
// and pushed onto `m.elems`; any other field is skipped according to its wire type.
// Returns the populated message `m`.
function DmSegMobileReply$decode(r, l) {
// Accept a raw input by wrapping it in a Reader first.
if (!(r instanceof Reader))
r = Reader.create(r)
// c: position at which decoding stops; m: the message instance being filled in.
var c = l === undefined ? r.len : r.pos + l, m = new this.ctor
while (r.pos < c) {
// t is the field tag: field number in the upper bits, wire type in the low 3 bits.
var t = r.uint32()
switch (t >>> 3) {
case 1:
// Create the array lazily on the first element.
if (!(m.elems && m.elems.length))
m.elems = []
m.elems.push(types[0].decode(r, r.uint32()))
break
default:
// Unknown field: skip its payload based on the wire type.
r.skipType(t & 7)
break
}
}
return m
}
# python实现
def decode(self, buffer: bytes):
    """Decode a danmaku segment payload into a list of elements.

    The payload is read as a sequence of tagged fields. Each field whose
    number is 1 is decoded with ``self.decodeElem``, which receives the
    reader and a uint32 length read from it; every other field is skipped
    according to its wire type.

    NOTE(review): this calls ``self.decodeElem(reader, length)``; confirm
    that ``decodeElem`` is bound so that it receives exactly those two
    arguments (e.g. declared as a staticmethod).

    :param buffer: raw bytes of the segment file.
    :return: list of decoded elements, in the order they appear.
    """
    reader = BufferReader(buffer)
    elements = []
    while reader.pos < reader.len:
        tag = reader.uint32()
        field_number = tag >> 3
        if field_number != 1:
            # Not an element entry: skip by wire type (low 3 bits of the tag).
            reader.skipType(tag & 7)
            continue
        elements.append(self.decodeElem(reader, reader.uint32()))
    return elements
Reader
有一个Reader,作用是用于读取bytes,也就是弹幕文件的字节。值得注意的是,编码中的数字采用的是varint编码:每个字节的低7位存放数据,最高位表示后面是否还有字节,并且低位在前(类似小端模式,踩坑经验)。这部分代码较长,只附上核心部分:
// JS source
// Core reading methods on the reader prototype `u`. The reader keeps a byte
// buffer `buf`, a cursor `pos` and the total length `len`.

// uint32: reads a varint. Each byte contributes its low 7 bits, lowest group
// first, and a byte below 128 ends the value. The fifth byte contributes only
// 4 bits (mask 15), and `>>> 0` keeps the result an unsigned 32-bit number.
// If the fifth byte still has its top bit set, the cursor is advanced by 5
// more bytes; when that passes `len`, the cursor is clamped to `len` and an
// error built by `s` is thrown.
u.prototype.uint32 = (l = 4294967295,
function () {
if (l = (127 & this.buf[this.pos]) >>> 0,
this.buf[this.pos++] < 128)
return l;
if (l = (l | (127 & this.buf[this.pos]) << 7) >>> 0,
this.buf[this.pos++] < 128)
return l;
if (l = (l | (127 & this.buf[this.pos]) << 14) >>> 0,
this.buf[this.pos++] < 128)
return l;
if (l = (l | (127 & this.buf[this.pos]) << 21) >>> 0,
this.buf[this.pos++] < 128)
return l;
if (l = (l | (15 & this.buf[this.pos]) << 28) >>> 0,
this.buf[this.pos++] < 128)
return l;
if ((this.pos += 5) > this.len)
throw this.pos = this.len,
s(this, 10);
return l
}
),
// int32: same bytes as uint32; `0 | x` reinterprets the value as a signed
// 32-bit integer.
u.prototype.int32 = function () {
return 0 | this.uint32()
}
,
// bytes: reads a uint32 length, then returns that many bytes starting at the
// cursor and advances past them. Throws when the range would pass `len`.
u.prototype.bytes = function () {
var e = this.uint32()
, t = this.pos
, r = this.pos + e;
if (r > this.len)
throw s(this, e);
return this.pos += e,
Array.isArray(this.buf) ? this.buf.slice(t, r) : t === r ? new this.buf.constructor(0) : this._slice.call(this.buf, t, r)
}
,
// string: reads a length-delimited byte run and converts it with `a.read`
// (presumably a UTF-8 decoder; `a` is not shown here).
u.prototype.string = function () {
var e = this.bytes();
return a.read(e, 0, e.length)
}
,
// skip: with a numeric argument, advances the cursor by that many bytes
// (throwing if that would pass `len`); without one, skips a single varint by
// consuming bytes until one without the top bit (128) is found.
u.prototype.skip = function (e) {
if ("number" == typeof e) {
if (this.pos + e > this.len)
throw s(this, e);
this.pos += e
} else
do {
if (this.pos >= this.len)
throw s(this)
} while (128 & this.buf[this.pos++]);
return this
}
,
// skipType: skips one field payload according to wire type `e`:
//   0 -> a varint, 1 -> 8 bytes, 2 -> a uint32 length plus that many bytes,
//   3 -> nested fields until a tag with wire type 4 is read, 5 -> 4 bytes.
// Any other wire type throws.
u.prototype.skipType = function (e) {
switch (e) {
case 0:
this.skip();
break;
case 1:
this.skip(8);
break;
case 2:
this.skip(this.uint32());
break;
case 3:
for (; 4 != (e = 7 & this.uint32());)
this.skipType(e);
break;
case 5:
this.skip(4);
break;
default:
throw Error("invalid wire type " + e + " at offset " + this.pos)
}
return this
}
# python实现
class BufferReader(object):
    """Sequential reader for varint-based tagged binary data held in memory.

    Attributes:
        buf: the bytes being read.
        len: total number of bytes in ``buf``.
        pos: index of the next unread byte.

    Reading or skipping past the end of the buffer raises ``IndexError``.
    """

    def __init__(self, buffer: bytes):
        self.buf = buffer
        self.len = len(self.buf)
        self.pos = 0

    def _varint(self) -> int:
        """Read one varint of at most 10 bytes and return it unmasked.

        Each byte carries 7 data bits, lowest group first; a byte below 128
        terminates the value.

        Raises:
            IndexError: the buffer ends before the varint terminates.
            ValueError: no terminating byte within 10 bytes.
        """
        value = 0
        for shift in range(0, 70, 7):
            if self.pos >= self.len:
                raise IndexError(f"truncated varint at offset {self.pos}")
            byte = self.buf[self.pos]
            self.pos += 1
            value |= (byte & 0x7F) << shift
            if byte < 0x80:
                return value
        raise ValueError(f"varint longer than 10 bytes at offset {self.pos}")

    def skip(self, step=-1):
        """Advance the cursor.

        With ``step >= 0`` the cursor moves forward by ``step`` bytes.
        Otherwise a single varint is skipped.

        Raises:
            IndexError: the skip would run past the end of the buffer.
        """
        if step >= 0:
            if self.pos + step > self.len:
                raise IndexError(
                    f"cannot skip {step} bytes at offset {self.pos}")
            self.pos += step
            return
        while True:
            if self.pos >= self.len:
                raise IndexError(f"truncated varint at offset {self.pos}")
            byte = self.buf[self.pos]
            self.pos += 1
            if not byte & 0x80:
                break

    def skipType(self, t):
        """Skip one field payload of wire type ``t``.

        0 skips a varint, 1 skips 8 bytes, 2 skips a length-prefixed run,
        3 skips nested fields until a tag with wire type 4, 5 skips 4 bytes.

        Raises:
            ValueError: ``t`` is not one of the wire types above.
        """
        if t == 0:
            self.skip()
        elif t == 1:
            self.skip(8)
        elif t == 2:
            self.skip(self.uint32())
        elif t == 3:
            while True:
                inner = 7 & self.uint32()
                if inner == 4:
                    break
                self.skipType(inner)
        elif t == 5:
            self.skip(4)
        else:
            raise ValueError(
                "invalid wire type " + str(t) + " at offset " + str(self.pos))

    def uint32(self):
        """Read a varint and return its low 32 bits as an unsigned int.

        The whole varint is consumed even when it is longer than 5 bytes
        (as it is for a negative 32-bit value), so the cursor always lands
        on the next field.
        """
        return self._varint() & 0xFFFFFFFF

    def int32(self):
        """Read a varint and return it as a signed 32-bit int."""
        value = self.uint32()
        # Python ints are unbounded, so the sign must be applied explicitly.
        return value - (1 << 32) if value & 0x80000000 else value

    def uint64(self):
        """Read a varint and return its low 64 bits as an unsigned int."""
        return self._varint() & 0xFFFFFFFFFFFFFFFF

    def int64(self):
        """Read a varint and return it as a signed 64-bit int."""
        value = self.uint64()
        return value - (1 << 64) if value & (1 << 63) else value

    def string(self):
        """Read a uint32 length followed by that many UTF-8 bytes.

        Invalid UTF-8 sequences are dropped rather than raising.

        Raises:
            IndexError: the declared length runs past the end of the buffer.
        """
        size = self.uint32()
        end = self.pos + size
        if end > self.len:
            raise IndexError(
                f"string of {size} bytes at offset {self.pos} exceeds buffer")
        data = self.buf[self.pos:end]
        self.pos = end
        return data.decode('utf8', errors='ignore')
decodeElem
还有一个decode方法用于解析单个弹幕元素:
// JS source
// Decodes a single danmaku element.
//
// Reads tagged fields from reader `r` until position `c`, which is the end of
// the reader when `l` is undefined and `r.pos + l` otherwise. Each known field
// number is stored on the message `m` under the property shown in its case;
// unknown fields are skipped according to their wire type. Returns `m`.
function DanmakuElem$decode(r, l) {
// Accept a raw input by wrapping it in a Reader first.
if (!(r instanceof Reader))
r = Reader.create(r)
// c: position at which decoding stops; m: the message instance being filled in.
var c = l === undefined ? r.len : r.pos + l, m = new this.ctor
while (r.pos < c) {
// t is the field tag: field number in the upper bits, wire type in the low 3 bits.
var t = r.uint32()
switch (t >>> 3) {
case 2:
m.stime = r.int32()
break
case 3:
m.mode = r.int32()
break
case 4:
m.size = r.int32()
break
case 5:
m.color = r.uint32()
break
case 6:
m.uhash = r.string()
break
case 7:
m.text = r.string()
break
case 8:
m.date = r.int64()
break
case 9:
m.weight = r.int32()
break
case 10:
m.action = r.string()
break
case 11:
m.pool = r.int32()
break
case 12:
m.dmid = r.string()
break
case 13:
m.attr = r.int32()
break
case 22:
m.animation = r.string()
break
default:
// Unknown field: skip its payload based on the wire type.
r.skipType(t & 7)
break
}
}
return m
}
# python实现
def decodeElem(r: BufferReader, eLen: int):
    """Decode one danmaku element from the reader into a dict.

    Fields are read from ``r`` until its cursor reaches ``r.pos + eLen``
    (measured from the cursor on entry). Known field numbers are stored
    under the keys listed below; unknown ones are skipped by wire type.

    :param r: reader positioned at the first byte of the element.
    :param eLen: number of bytes the element occupies.
    :return: dict holding only the fields present in the data.
    """
    # field number -> (output key, name of the reader method that parses it)
    layout = {
        2: ('stime', 'int32'),
        3: ('mode', 'int32'),
        4: ('size', 'int32'),
        5: ('color', 'uint32'),
        6: ('uhash', 'string'),
        7: ('text', 'string'),
        8: ('date', 'int64'),
        9: ('weight', 'int32'),
        10: ('action', 'string'),
        11: ('pool', 'int32'),
        12: ('dmid', 'string'),
        13: ('attr', 'int32'),
        22: ('animation', 'string'),
    }
    elem = {}
    end = r.pos + eLen
    while r.pos < end:
        tag = r.uint32()
        spec = layout.get(tag >> 3)
        if spec is None:
            # Unknown field: the low 3 bits of the tag give its wire type.
            r.skipType(tag & 7)
            continue
        key, method = spec
        elem[key] = getattr(r, method)()
    return elem
运行
至此,代码已经完全完成了,随便取一个弹幕文件试一下吧
api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid=745899804&pid=638907823&segment_index=1&pull_mode=1&ps=120000&pe=360000
解析成功!