用Python分析《令人心动的offer2》的13万条弹幕,网友们都在吐槽什么?
天作之程
共 6223字,需浏览 13分钟
·
2020-12-08 08:33
前言
数据获取
# -*- coding: utf-8 -*-
#@Time : 2020/11/30 21:35
#@Author : 公众号 菜J学Python
#@File : tengxun_danmu.py
import requests
import json
import time
import pandas as pd
# Scrape the danmu (bullet comments) of one Tencent Video episode and save them to CSV.
target_id = "6130942571%26"  # target_id of the interview episode; "%26" is a URL-encoded "&"
vid = "%3Dt0034o74jpr"  # vid of the interview episode; "%3D" is a URL-encoded "="
# The request headers never change, so build them once instead of once per request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
# Rows are collected in a plain list and turned into a DataFrame once at the end;
# concatenating a one-row frame per comment copies the whole frame every time.
rows = []
for page in range(15, 3214, 30):  # the video is 3214 seconds long; step through it 30 s at a time
    # The query string needs "&timestamp="; a mangled "×tamp" would fuse it into the otype value.
    url = 'https://mfm.video.qq.com/danmu?otype=json&timestamp={0}&target_id={1}vid{2}&count=80'.format(page, target_id, vid)
    print("正在提取第" + str(page) + "页")
    html = requests.get(url, headers=headers)
    # strict=False lets json accept control characters inside strings, which
    # otherwise make some responses fail to parse.
    bs = json.loads(html.text, strict=False)
    time.sleep(1)  # pause between requests
    # Pull the fields of interest out of every comment
    for i in bs['comments']:
        rows.append({
            '弹幕': i['content'],  # danmu text
            '会员等级': i['uservip_degree'],  # VIP level
            '发布时间': i['timepoint'],  # time point of the comment
            '弹幕点赞': i['upcount'],  # like count
            '弹幕id': i['commentid'],  # danmu id
        })
# NOTE: the index written to the CSV is now a running row number (0..n-1).
df = pd.DataFrame(rows, columns=['弹幕', '会员等级', '发布时间', '弹幕点赞', '弹幕id'])
df.to_csv('面试篇.csv', encoding='utf-8')
数据清洗
合并弹幕数据
import pandas as pd
import numpy as np
# Read the four per-episode danmu exports.
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/菜J学Python/弹幕/腾讯/令人心动的offer/面试篇.csv",
        "/菜J学Python/弹幕/腾讯/令人心动的offer/第1期.csv",
        "/菜J学Python/弹幕/腾讯/令人心动的offer/第2期.csv",
        "/菜J学Python/弹幕/腾讯/令人心动的offer/第3期.csv",
    )
)
# Tag every frame with the episode it came from.
for episode_frame, episode_label in zip((df1, df2, df3, df4), ("面试篇", "第1期", "第2期", "第3期")):
    episode_frame["期数"] = episode_label
# Stack the episodes into a single frame and peek at ten random rows.
df = pd.concat([df1, df2, df3, df4])
df.sample(10)
查看数据信息
# Print a summary of the merged frame: index range, columns, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 133627 entries, 0 to 34923
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 133627 non-null int64
1 用户名 49040 non-null object
2 内容 133626 non-null object
3 会员等级 133627 non-null int64
4 评论时间点 133627 non-null int64
5 评论点赞 133627 non-null int64
6 评论id 133627 non-null int64
7 期数 133627 non-null object
dtypes: int64(5), object(3)
memory usage: 9.2+ MB
重命名字段
# Old column name -> name used by the rest of the analysis.
column_names = {
    '用户名': '用户昵称',
    '内容': '弹幕内容',
    '评论时间点': '发送时间',
    '评论点赞': '弹幕点赞',
    '期数': '所属期数',
}
df = df.rename(columns=column_names)
过滤字段
# Keep only the columns needed for the analysis.
# .copy() makes the selection an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df[["用户昵称","弹幕内容","会员等级","发送时间","弹幕点赞","所属期数"]].copy()
缺失值处理
# Replace missing nicknames with the placeholder "无名氏".
df["用户昵称"] = df["用户昵称"].fillna("无名氏")
发送时间处理
def time_change(seconds):
    """Convert a number of seconds into an ``H:MM:SS`` string.

    Args:
        seconds: Offset in seconds (e.g. a danmu's time point in the video).

    Returns:
        The offset formatted as hours, two-digit minutes and two-digit
        seconds, e.g. ``8888`` -> ``"2:28:08"``. The hour is not zero-padded.
    """
    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    # No print here: the function is applied to every row of the frame, and a
    # debug print would emit one line per danmu.
    return "%d:%02d:%02d" % (hours, minutes, secs)
# Try the helper on a sample value.
time_change(seconds=8888)
# Format every offset as H:MM:SS, then round-trip through datetime so the
# final strings are zero-padded HH:MM:SS.
df["发送时间"] = df["发送时间"].apply(time_change)
df['发送时间'] = pd.to_datetime(df['发送时间']).dt.strftime('%H:%M:%S')
弹幕内容处理
# Cast every danmu to str so the string-slicing compression step can process it.
df["弹幕内容"] = df["弹幕内容"].astype("str")
# Mechanical compression: collapse immediately repeated substrings
def yasuo(st):
    """Collapse consecutive repeats of a substring into a single occurrence.

    For every unit width from 1 up to half of the input length, the string is
    scanned left to right; wherever a unit is immediately followed by one or
    more copies of itself, only one copy is kept.

    Args:
        st: The text to compress.

    Returns:
        The compressed text, e.g. ``"真的真的真的很菜很菜"`` -> ``"真的很菜"``.
        Strings shorter than two characters are returned unchanged.
    """
    # The width range is fixed from the original length; widths that no longer
    # fit after the string has shrunk simply find nothing to compress.
    for width in range(1, len(st) // 2 + 1):
        start = 0
        # A repeat needs two full units, so stop once fewer than 2*width characters remain.
        while start + 2 * width <= len(st):
            unit = st[start:start + width]
            if st[start + width:start + 2 * width] == unit:
                # `end` marks the start of the last consecutive copy of `unit`.
                end = start + width
                while end + 2 * width <= len(st) and st[end + width:end + 2 * width] == unit:
                    end += width
                # Drop every copy before the last one.
                st = st[:start] + st[end:]
            start += 1
    return st
# Try the compressor on a sample sentence.
yasuo(st="菜J学Python真的真的真的很菜很菜")
# Compress repeated fragments in every danmu.
compressed = df["弹幕内容"].apply(yasuo)
df["弹幕内容"] = compressed
# Keep the first run of Chinese characters in each danmu; text without any becomes missing.
chinese_only = df['弹幕内容'].str.extract(r"([\u4e00-\u9fa5]+)")
df['弹幕内容'] = chinese_only
# Drop rows with missing values, e.g. emoji-only danmu that had no Chinese text.
df = df.dropna()
数据分析
各期弹幕数量对比
import pyecharts.options as opts
from pyecharts.charts import *
from pyecharts.globals import ThemeType
# Bar chart: number of danmu per episode.
df7 = df["所属期数"].value_counts()
print(df7.index.to_list())
print(df7.to_list())
c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
    .add_xaxis(df7.index.to_list())
    .add_yaxis("", df7.to_list())
    .set_global_opts(
        # Subtitle source name fixed from "腾讯视屏" to "腾讯视频", matching the other charts.
        title_opts=opts.TitleOpts(title="各期弹幕数量", subtitle="数据来源:腾讯视频 \t制图:菜J学Python", pos_left='left'),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # x-axis label font size
        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # y-axis label font size
    )
    .set_series_opts(label_opts=opts.LabelOpts(font_size=16, position='top'))
)
c.render_notebook()
谁是弹幕发射机
# Ranks 2-11 by danmu count: the most frequent nickname (position 0) is skipped,
# presumably because it is the "无名氏" placeholder; verify against the data.
df8 = df["用户昵称"].value_counts()[1:11]
# Ascending order, so the largest bar ends up at the top of the horizontal chart.
df8 = df8.sort_values(ascending=True).tail(10)
sender_names = df8.index.to_list()
sender_counts = df8.to_list()
print(sender_names)
print(sender_counts)
c = Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
c.add_xaxis(sender_names)
c.add_yaxis("", sender_counts)
c.reversal_axis()  # swap the x and y axes to get horizontal bars
c.set_global_opts(
    title_opts=opts.TitleOpts(title="弹幕发送数量TOP10", subtitle="数据来源:腾讯视频 \t制图:菜J学Python", pos_left='left'),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # x-axis label font size
    yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # y-axis label font size
)
c.set_series_opts(label_opts=opts.LabelOpts(font_size=16, position='right'))
c.render_notebook()
# Look at ten random danmu from one of the heavy senders.
is_target_user = df["用户昵称"] == "想太多de猫"
df[is_target_user].sample(10)
会员等级分布
# Count danmu per VIP level; levels become strings so they can be used as pie labels.
df2 = df["会员等级"].astype("str").value_counts()
print(df2)
df2 = df2.sort_values(ascending=False)
regions = df2.index.to_list()
values = df2.to_list()
level_counts = list(zip(regions, values))
c = Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK))
c.add("", level_counts)
c.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    title_opts=opts.TitleOpts(title="会员等级分布", subtitle="数据来源:腾讯视频\t制图:菜J学Python", pos_top="0.5%", pos_left='left'),
)
c.set_series_opts(label_opts=opts.LabelOpts(formatter="等级{b}占比:{d}%", font_size=14))
c.render_notebook()
弹幕在讨论些什么
# Word-segmentation helper
def get_cut_words(content_series):
    """Segment all danmu into words and drop stop words and single characters.

    Args:
        content_series: pandas Series of danmu strings.

    Returns:
        A list of tokens, in segmentation order, that are at least two
        characters long and are not in the stop-word set.

    NOTE(review): ``jieba`` is used here but never imported in the visible
    source; confirm it is imported before this function is called.
    """
    # Load the stop-word list (one entry per line) into a set, so the membership
    # test in the filter below is O(1) per token rather than a list scan.
    with open("/菜J学Python/offer/stop_words.txt", 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    # Register show-specific words so the tokenizer keeps them in one piece
    for word in ['撒老师', '范丞丞', '第一季']:
        jieba.add_word(word)
    # Extra stop words for this analysis
    stop_words.update(['好像', '真的', '感觉'])
    # Join all danmu with a full stop between them and segment in one pass
    word_num = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)
    # Keep tokens of length >= 2 that are not stop words
    return [word for word in word_num if len(word) >= 2 and word not in stop_words]
# Render the word cloud.
# NOTE(review): `stylecloud` and `Image` are not imported in the visible source; confirm.
text1 = get_cut_words(content_series=df['弹幕内容'])
cloud_text = ' '.join(text1)
stylecloud.gen_stylecloud(
    text=cloud_text,
    max_words=100,
    collocations=False,
    font_path='字酷堂清楷体.ttf',
    icon_name='fas fa-square',
    size=653,
    # palette is not passed (a 'matplotlib.Inferno_9' setting was left commented out)
    output_name='./offer.png',
)
# Show the generated image.
Image(filename='./offer.png')
大家如何评论8个实习生
# Bar chart of how often each person is mentioned (positions 1-10 of the counts).
# NOTE(review): the "人物提及" column is not created anywhere in the visible
# source; confirm where it is derived before running this cell.
df8 = df["人物提及"].value_counts()[1:11]
mention_names = df8.index.to_list()
mention_counts = df8.to_list()
print(mention_names)
print(mention_counts)
c = Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
c.add_xaxis(mention_names)
c.add_yaxis("", mention_counts)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="人物提及次数", subtitle="数据来源:腾讯视频 \t制图:菜J学Python", pos_left='left'),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # x-axis label font size
    yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # y-axis label font size
)
c.set_series_opts(label_opts=opts.LabelOpts(font_size=16, position='top'))
c.render_notebook()
情感分析
import paddlehub as hub
# Score each danmu with Baidu's open-source pretrained sentiment model
senta = hub.Module(name="senta_bilstm")
texts = df['弹幕内容'].tolist()
input_data = {'text': texts}
res = senta.sentiment_classify(data=input_data)
df['情感分值'] = [x['positive_probs'] for x in res]
# Resample to 15-minute buckets.
# resample() needs a datetime-like index, but the send time was stored as
# "HH:MM:SS" strings earlier in the script, so parse it back first.
df.index = pd.to_datetime(df['发送时间'], format='%H:%M:%S')
# Average only the numeric columns: the text columns cannot be averaged and
# make .mean() fail on current pandas.
data = df.select_dtypes('number').resample('15min').mean().reset_index()
# Colour the summary table with a light orange gradient
import seaborn as sns
color_map = sns.light_palette('orange', as_cmap=True)  # light_palette colour map
data.style.background_gradient(color_map)
c = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.DARK))
    # Label each bucket by its time of day; the date part of the parsed
    # timestamps is only a placeholder.
    .add_xaxis(data["发送时间"].dt.strftime('%H:%M:%S').to_list())
    .add_yaxis('情感倾向', list(data["情感分值"].round(2)), is_smooth=True, is_connect_nones=True, areastyle_opts=opts.AreaStyleOpts(opacity=0.5))
    .set_global_opts(title_opts=opts.TitleOpts(title="情感倾向", subtitle="数据来源:腾讯视频 \t制图:菜J学Python", pos_left='left'))
)
c.render_notebook()
-END- 往期精彩推荐 -- -- 1、小伙子不讲武德,马保国... -- 2、NBA球星数据查询(GUI界面) -- 3、批量下载bilibili视频 -- 留下你的“在看”呗!
评论