Python大作业---微博爬虫及简单数据分析

刚开始学 Python，选了这个题目，把代码放上来留念。没有用到流行的爬虫框架，所以代码量比较大。
GUI 是用 wxPython 写的。

# _*_ coding: UTF-8 _*_
import os
import re
import requests
import sys
import wx
import traceback
from datetime import datetime
from datetime import timedelta
from lxml import etree
import data_analysis

# Path of the text file holding the crawled data. get_cookie() assigns it,
# write_txt() writes to it and the figure_* handlers pass it to data_analysis.
# NOTE(review): a `global` statement at module scope has no effect.
global file_path
file_path = ''
class Wb(wx.App):
def Operate(self):
    """Initialise the crawler state and show the start window.

    The start window contains three text fields (login cookie, id of the
    account to crawl, directory for the data file) and a button that is
    bound to get_cookie().
    """
    # ---- crawler state ------------------------------------------------
    self.cookie = {}
    self.username = ''      # nickname of the crawled user
    self.Number = 0         # total number of posts reported by the profile
    self.number1 = 0        # number of posts actually crawled
    self.Guanzhu = 0        # following count
    self.fans = 0           # follower count
    self.Content = []       # post texts
    self.Time = []          # post publish times
    self.star = []          # like count per post
    self.Zhuanfa = []       # repost count per post
    self.Pinglun = []       # comment count per post
    self.publish_tool = []  # publishing client per post
    self.Id = 0
    # Index of the post shown by weibo_info(). It was never initialised,
    # so the first use of self.a raised AttributeError.
    self.a = 0

    # ---- GUI ----------------------------------------------------------
    # Main frame and its panel.
    self.frame_operate = wx.Frame(
        None, title="Weibo_Spider_GUI", size=(500, 500))
    self.panel_operate = wx.Panel(self.frame_operate, -1)

    # Title label with the large font.
    self.font1 = wx.Font(18, wx.ROMAN, wx.ITALIC, wx.NORMAL)
    self.label1 = wx.StaticText(
        self.panel_operate, -1, "WeiBo Spider", pos=(180, 60), style=wx.ALIGN_CENTER)
    self.label1.SetFont(self.font1)

    # Cookie label and text box.
    self.label2 = wx.StaticText(
        self.panel_operate, -1, "请输入您微博登陆的有效cookie", pos=(160, 130), style=wx.ALIGN_CENTER)
    self.textCookie = wx.TextCtrl(
        self.panel_operate, -1, pos=(200, 150), size=(80, 20), style=wx.TE_CENTER)

    # Id of the account to crawl.
    self.label3 = wx.StaticText(
        self.panel_operate, -1, "请输入您所要爬取微博账号的self.Id", pos=(160, 180), style=wx.ALIGN_CENTER)
    self.textId = wx.TextCtrl(
        self.panel_operate, -1, pos=(200, 200), size=(80, 20), style=wx.TE_CENTER)

    # Directory in which the data file is stored.
    self.label4 = wx.StaticText(
        self.panel_operate, -1, "数据文件保存路径", pos=(160, 230), style=wx.ALIGN_CENTER)
    self.textFile_path = wx.TextCtrl(
        self.panel_operate, -1, pos=(200, 250), size=(80, 20), style=wx.TE_CENTER)

    # Start button and its click handler.
    self.button_start = wx.Button(self.panel_operate, -1, "开始爬取微博信息", pos=(200, 350))
    self.Bind(wx.EVT_BUTTON, self.get_cookie, self.button_start)
    self.frame_operate.Show()

# 微博的正式UI界面-----------------------------------------------------------------------------------
# 获取用户输入的参数值

#注意getvalue不能和用户输入放在一个函数里，要分开写，而且，获取不同的值，也要放在不同函数里！！

def get_cookie(self, event):
    """Read the input fields, store them and start the crawl.

    Sets self.cookie, self.Id and the module-level file_path
    (``<directory>/<Id>.txt``), then calls Onbutton_Start(). When the id
    field is not an integer a message box is shown and nothing is started,
    instead of letting ValueError escape from the event handler.

    Args:
        event: the button event (unused).
    """
    global file_path
    try:
        user_id = int(self.textId.GetValue().strip())
    except ValueError:
        wx.MessageBox("微博账号Id必须是数字")
        return

    self.cookie = {"Cookie": self.textCookie.GetValue()}
    self.Id = user_id
    # os.path.join keeps an empty directory relative to the working
    # directory; plain concatenation produced a path at the filesystem root.
    directory = self.textFile_path.GetValue().strip()
    file_path = os.path.join(directory, "%d" % self.Id + ".txt")
    self.Onbutton_Start()

def Onbutton_Start(self):
    """Run the whole crawl, save the result and open the result window.

    The steps run strictly in this order: nickname, profile counters,
    per-post data, text file output, completion dialog.
    """
    pipeline = ("GetName", "GetSimple_Info", "weibo_para", "write_txt", "weibo_UI1")
    for step_name in pipeline:
        getattr(self, step_name)()

def weibo_UI1(self):
    """Show a "crawl finished" dialog, then open the user summary window."""
    wx.MessageBox("文件爬取完毕")
    self.weibo_UI2()

def weibo_UI2(self):
    """Replace the start window with a summary of the crawled account.

    Shows the nickname, post count, follower count and following count, plus
    a button bound to weibo_UI3().
    """
    self.frame_operate.Destroy()
    self.frame_Info = wx.Frame(None, title="User_Information", size=(500, 500))
    self.panel_Info = wx.Panel(self.frame_Info, -1)

    def make_label(caption, position):
        # All labels on this panel share parent, id and alignment.
        return wx.StaticText(self.panel_Info, -1, caption, pos=position, style=wx.ALIGN_LEFT)

    self.label16 = make_label(self.username, (200, 100))
    self.label5 = make_label("用户昵称：" + str(self.username), (180, 130))
    self.label13 = make_label("微博数:" + str(self.Number), (180, 150))
    self.label14 = make_label("粉丝数:" + str(self.fans), (180, 170))
    self.label15 = make_label("关注数：" + str(self.Guanzhu), (180, 190))

    # font2 is the small font; font1 (created in Operate) is the large one.
    self.font2 = wx.Font(13, wx.SCRIPT, wx.ITALIC, wx.NORMAL)
    self.label16.SetFont(self.font1)
    for small_label in (self.label5, self.label13, self.label14, self.label15):
        small_label.SetFont(self.font2)

    self.button_news = wx.Button(self.panel_Info, -1, "查看最近微博", pos=(220, 280))
    self.Bind(wx.EVT_BUTTON, self.weibo_UI3, self.button_news)
    self.frame_Info.Show()

# 最近微博
def weibo_UI3(self, event):
    """Replace the summary window with details of the newest/pinned post.

    Shows text, publishing client, time and like/repost/comment counts of
    the first crawled post, plus buttons for browsing the posts
    (weibo_pre_info) and for the chart menu (analysis_UI). When nothing was
    crawled a short notice is shown instead of the details; previously the
    detail widgets referenced text variables that were never assigned in
    that case.

    Args:
        event: the button event (unused).
    """
    self.frame_Info.Destroy()
    self.frame_news = wx.Frame(None, title="---", size=(500, 500))
    self.panel_news = wx.Panel(self.frame_news, -1)
    wx.StaticText(self.panel_news, -1, "最新微博动态", pos=(200, 40))

    if self.Content:
        text1 = "最新/置顶微博为: " + self.Content[0]
        text2 = "最新/置顶微博发布工具: " + self.publish_tool[0]
        text3 = "最新/置顶微博发布时间: " + self.Time[0]
        text4 = "最新/置顶微博获得赞数: " + str(self.star[0])
        text5 = "最新/置顶微博获得转发数: " + str(self.Zhuanfa[0])
        text6 = "最新/置顶微博获得评论数: " + str(self.Pinglun[0])

        # A multi-line TextCtrl is used for the post body so long text wraps.
        self.label6 = wx.TextCtrl(self.panel_news, -1, text1, pos=(90, 60), size=(250, 140),
                                  style=wx.TE_MULTILINE | wx.TE_RICH)
        self.label7 = wx.StaticText(self.panel_news, -1, text2, pos=(90, 200), style=wx.ALIGN_LEFT)
        self.label8 = wx.StaticText(self.panel_news, -1, text3, pos=(90, 220), style=wx.ALIGN_LEFT)
        self.label9 = wx.StaticText(self.panel_news, -1, text4, pos=(90, 240), style=wx.ALIGN_LEFT)
        self.label10 = wx.StaticText(self.panel_news, -1, text5, pos=(90, 260), style=wx.ALIGN_LEFT)
        self.label11 = wx.StaticText(self.panel_news, -1, text6, pos=(90, 280), style=wx.ALIGN_LEFT)
    else:
        self.label6 = wx.StaticText(self.panel_news, -1, "暂未爬取到任何微博", pos=(90, 60),
                                    style=wx.ALIGN_LEFT)

    # Browse the crawled posts one by one.
    self.Button_info = wx.Button(self.panel_news, -1, "点击查看之前的微博内容", pos=(220, 340))
    self.Bind(wx.EVT_BUTTON, self.weibo_pre_info, self.Button_info)
    # Open the chart selection window.
    self.Button_file = wx.Button(self.panel_news, -1, "点击查看微博数据分析图表", pos=(220, 380))
    self.Bind(wx.EVT_BUTTON, self.analysis_UI, self.Button_file)
    self.frame_news.Show()

def analysis_UI(self, event):
    """Open the chart menu: one button per chart, bound to figure_1..figure_4.

    Args:
        event: the button event (unused).
    """
    self.frame_data = wx.Frame(None, title="data_analysis--20177830115", size=(500, 500))
    self.panel_data = wx.Panel(self.frame_data, -1)

    # (attribute name, caption, y position, click handler)
    button_specs = (
        ("button_1", "2017-2018微博转发/点赞量折线统计图", 120, self.figure_1),
        ("button_2", '原创微博与转发微博统计图', 160, self.figure_2),
        ("button_3", '微博发布工具统计图', 200, self.figure_3),
        ("button_4", '微博使用心情统计图', 240, self.figure_4),
    )
    created = []
    for attr_name, caption, y_pos, handler in button_specs:
        button = wx.Button(self.panel_data, -1, caption, pos=(180, y_pos))
        setattr(self, attr_name, button)
        created.append((handler, button))
    for handler, button in created:
        self.Bind(wx.EVT_BUTTON, handler, button)
    self.frame_data.Show()

def figure_1(self, event):
    """Show the repost/like line chart built from the saved data file.

    Args:
        event: the button event (unused).
    """
    data_analysis.analysis(file_path, self.Number).analyse_Zhexian()

def figure_2(self, event):
    """Show the original-vs-reposted pie chart built from the saved data file.

    Args:
        event: the button event (unused).
    """
    data_analysis.analysis(file_path, self.Number).analyse_YC()

def figure_3(self, event):
    """Show the publishing-client pie chart built from the saved data file.

    Args:
        event: the button event (unused).
    """
    data_analysis.analysis(file_path, self.Number).analyse_GJ()

def figure_4(self, event):
    """Show the emoticon-usage pie chart built from the saved data file.

    Args:
        event: the button event (unused).
    """
    data_analysis.analysis(file_path, self.Number).analyse_XQ()

def weibo_pre_info(self, event):
    """Button adapter for weibo_info().

    weibo_info() takes no event argument because cont() re-enters it
    directly, so this handler only drops the event and forwards the call.

    Args:
        event: the button event (unused).
    """
    open_browser = self.weibo_info
    open_browser()

def weibo_info(self):
    """Open a window showing the crawled post at index ``self.a``.

    The window has a "next" button (cont) and a "close" button (exit).
    ``self.a`` defaults to 0 when it has not been set yet. When the index
    is outside the crawled posts (no posts at all, or "next" pressed on the
    last one) a notice is shown, the index is reset to 0 and no window is
    opened; previously this raised IndexError.
    """
    index = getattr(self, "a", 0)
    if not 0 <= index < len(self.Content):
        self.a = 0
        wx.MessageBox("没有更多微博了")
        return
    self.a = index

    self.s = wx.Frame(None, title="---", size=(500, 500))
    self.f = wx.Panel(self.s, -1)

    text1 = str(index + 1) + ":" + self.Content[index]
    text2 = "发布工具: " + self.publish_tool[index]
    text3 = "发布时间: " + self.Time[index]
    text4 = "点赞数: " + str(self.star[index])
    text5 = "转发数: " + str(self.Zhuanfa[index])
    text6 = "评论数: " + str(self.Pinglun[index])

    # A multi-line TextCtrl replaces a StaticText for the post body because
    # the static control did not wrap long text usefully.
    self.labela = wx.TextCtrl(self.f, -1, text1, pos=(80, 60), size=(250, 140),
                              style=wx.TE_MULTILINE | wx.TE_RICH)
    self.labelb = wx.StaticText(self.f, -1, text2, pos=(80, 200), style=wx.ALIGN_LEFT)
    self.labelc = wx.StaticText(self.f, -1, text3, pos=(80, 220), style=wx.ALIGN_LEFT)
    self.labeld = wx.StaticText(self.f, -1, text4, pos=(80, 240), style=wx.ALIGN_LEFT)
    self.labele = wx.StaticText(self.f, -1, text5, pos=(80, 260), style=wx.ALIGN_LEFT)
    self.labelf = wx.StaticText(self.f, -1, text6, pos=(80, 280), style=wx.ALIGN_LEFT)

    self.button_next = wx.Button(self.f, -1, "查看下一条", pos=(300, 380))
    self.button_exit = wx.Button(self.f, -1, "关闭", pos=(100, 380))

    self.Bind(wx.EVT_BUTTON, self.exit, self.button_exit)
    self.Bind(wx.EVT_BUTTON, self.cont, self.button_next)
    self.s.Show()

def exit(self, event):
    """Close the post browser window.

    Args:
        event: the button event (unused).
    """
    browser_frame = self.s
    browser_frame.Destroy()

def cont(self, event):
    """Advance to the next post: bump the index, close the current window
    and open a new one through weibo_info().

    Args:
        event: the button event (unused).
    """
    self.a = self.a + 1
    current_frame = self.s
    current_frame.Destroy()
    self.weibo_info()

具体爬虫部分参考了 GitHub 上某位大佬的实现。

# 获取用户昵称
def GetName(self):
    """Fetch the account's info page and store the nickname in self.username.

    The nickname is the page <title> without its last three characters
    (the title carries a three-character suffix after the nickname).
    """
    info_url = "https://weibo.cn/%d/info" % (self.Id)
    response = requests.get(info_url, cookies=self.cookie)
    # etree.HTML parses the raw bytes into an element tree.
    document = etree.HTML(response.content)
    page_title = document.xpath("//title/text()")[0]
    self.username = page_title[:-3]

# 获取用户微博数、关注数、粉丝数
def GetSimple_Info(self):
    """Fetch page 1 of the timeline and store the profile counters.

    Sets self.Number (posts), self.Guanzhu (following) and self.fans
    (followers) from the ``div.tip2`` block of the page.

    Raises:
        ValueError: when the counter block is missing or a counter holds no
            number. Previously this surfaced as IndexError or as an unbound
            local variable.
    """
    url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
    html = requests.get(url, cookies=self.cookie).content
    selector = etree.HTML(html)
    pattern = r"\d+\.?\d*"

    def first_number(text, label):
        # Counters look like "微博[1543]"; take the first number in the text.
        numbers = re.findall(pattern, text)
        if not numbers:
            raise ValueError("no %s count found in %r" % (label, text))
        return int(numbers[0])

    post_texts = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")
    # The first two links of the block hold the following and follower counts.
    link_texts = selector.xpath("//div[@class='tip2']/a/text()")
    if not post_texts or len(link_texts) < 2:
        raise ValueError(
            "profile counters not found on %s (is the cookie still valid?)" % url)

    self.Number = first_number(post_texts[0], "post")
    self.Guanzhu = first_number(link_texts[0], "following")
    self.fans = first_number(link_texts[1], "follower")

# 获取"长微博"全部文字内容
def GetLong(self, weibo_link):
    """Fetch the full text of a long post from its comment page.

    Args:
        weibo_link: URL of the post's comment page.

    Returns:
        The post text with zero-width spaces removed and characters the
        console encoding cannot represent dropped. An empty string is
        returned when the page lacks the expected nodes; weibo_para() tests
        the result for truthiness and then keeps the truncated text.
    """
    html = requests.get(weibo_link, cookies=self.cookie).content
    selector = etree.HTML(html)

    blocks = selector.xpath("//div[@class='c']")
    if len(blocks) < 2:
        return ""
    spans = blocks[1].xpath("div/span[@class='ctt']")
    if not spans:
        return ""

    # sys.stdout can be missing or have no encoding (e.g. windowed
    # interpreter); fall back to UTF-8 instead of crashing.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    text = spans[0].xpath("string(.)").replace(u"\u200b", "")
    return text.encode(encoding, "ignore").decode(encoding)

# 获取转发微博信息
def GetZhuanfa(self, is_retweet, info, wb_content):
    """Build the display text for a reposted post.

    Args:
        is_retweet: nodes marking the post as a repost; the first one is
            queried for the original author's name.
        info: the node of the whole post.
        wb_content: text of the original (reposted) post.

    Returns:
        A fixed notice when the original author cannot be found, otherwise
        the repost reason (footer text up to the last "赞"), the original
        author and the original text, separated by newlines.
    """
    author_names = is_retweet[0].xpath("a/text()")
    if not author_names:
        return u"转发微博已被删除"

    author = author_names[0]
    footer_node = info.xpath("div")[-1]
    footer_text = footer_node.xpath("string(.)").replace(u"\u200b", "")
    # Round-trip through the console encoding drops unprintable characters.
    footer_text = footer_text.encode(sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
    reason = footer_text[:footer_text.rindex(u"赞")]
    pieces = (reason, "\n", u"原始用户: ", author, "\n", u"转发内容: ", wb_content)
    return "".join(pieces)

#一个界面展示一条微博的发布时间、点赞数、转发数、评论数
def weibo_para(self):
    """Crawl every timeline page and collect the data of each post.

    For each post the text, publish time, publishing client and the like /
    repost / comment counts are appended to self.Content, self.Time,
    self.publish_tool, self.star, self.Zhuanfa and self.Pinglun, and
    self.number1 is incremented. All values of a post are parsed before
    anything is appended, so the lists stay aligned if parsing fails.
    Pages without any ``div.c`` node are skipped instead of raising
    IndexError.
    """
    # sys.stdout can be missing or have no encoding; fall back to UTF-8.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"

    def console_safe(text):
        # Drop characters the console encoding cannot represent.
        return text.encode(encoding, "ignore").decode(encoding)

    def normalise_time(raw):
        # Convert the relative/short forms used by the site to
        # "YYYY-MM-DD HH:MM"; absolute stamps are cut to 16 characters.
        now = datetime.now()
        if u"刚刚" in raw:
            return now.strftime('%Y-%m-%d %H:%M')
        if u"分钟" in raw:
            minutes = timedelta(minutes=int(raw[:raw.find(u"分钟")]))
            return (now - minutes).strftime("%Y-%m-%d %H:%M")
        if u"今天" in raw:
            return now.strftime("%Y-%m-%d") + " " + raw[3:]
        if u"月" in raw:
            return (now.strftime("%Y") + "-" + raw[0:2] + "-" + raw[3:5]
                    + " " + raw[7:12])
        return raw[:16]

    url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
    html = requests.get(url, cookies=self.cookie).content
    selector = etree.HTML(html)
    # The hidden "mp" input holds the number of pages; absent on one page.
    pager = selector.xpath("//input[@name='mp']")
    page_num = int(pager[0].attrib["value"]) if pager else 1
    pattern = r"\d+\.?\d*"

    for page in range(1, page_num + 1):
        url2 = "https://weibo.cn/u/%d?&page=%d" % (self.Id, page)
        html2 = requests.get(url2, cookies=self.cookie).content
        selector2 = etree.HTML(html2)
        info = selector2.xpath("//div[@class='c']")
        if not info or not info[0].xpath("div/span[@class='ctt']"):
            continue

        # The last two div.c nodes of a page are not processed as posts.
        for post in info[:-2]:
            # --- post text ---
            str_t = post.xpath("div/span[@class='ctt']")
            content = console_safe(
                str_t[0].xpath("string(.)").replace(u"\u200b", ""))[:-1]
            weibo_Id = post.xpath("@id")[0][2:]
            a_link = post.xpath("div/span[@class='ctt']/a")
            is_retweet = post.xpath("div/span[@class='cmt']")
            if a_link and a_link[-1].xpath("text()")[0] == u"全文":
                # Truncated post: fetch the full text from the comment page.
                weibo_link = "https://weibo.cn/comment/" + weibo_Id
                wb_content = self.GetLong(weibo_link)
                if wb_content:
                    if not is_retweet:
                        wb_content = wb_content[1:]
                    content = wb_content
            if is_retweet:
                content = self.GetZhuanfa(is_retweet, post, content)

            # --- publish time and client ---
            str_time = console_safe(
                post.xpath("div/span[@class='ct']")[0].xpath("string(.)"))
            time_parts = str_time.split(u'来自')
            publish_time = normalise_time(time_parts[0])
            publish_tool = time_parts[1] if len(time_parts) > 1 else u"无"

            # --- like / repost / comment counts from the footer ---
            str_footer = console_safe(post.xpath("div")[-1].xpath("string(.)"))
            str_footer = str_footer[str_footer.rfind(u'赞'):]
            regx = re.findall(pattern, str_footer)
            star = int(regx[0])
            zhuanfa = int(regx[1])
            pinglun = int(regx[2])

            self.Content.append(content)
            self.Time.append(publish_time)
            self.publish_tool.append(publish_tool)
            self.star.append(star)
            self.Zhuanfa.append(zhuanfa)
            self.Pinglun.append(pinglun)
            self.number1 += 1

# 将爬取的信息写入文件--------------------------------------------------------------------------
def write_txt(self):
    """Write the profile summary and all crawled posts to ``file_path``.

    The file is encoded as UTF-8 because data_analysis opens it with
    ``encoding="utf-8"``; it used to be written in the console encoding,
    which made the file unreadable for the analysis step on non-UTF-8
    consoles. Errors are reported on stdout with a traceback and are not
    re-raised.
    """
    try:
        contents_header = u"\n\n微博内容: \n"
        parts = [u"用户信息\n用户昵称：" + self.username +
                 u"\n用户Id: " + str(self.Id) +
                 u"\n微博数: " + str(self.Number) +
                 u"\n关注数: " + str(self.Guanzhu) +
                 u"\n粉丝数: " + str(self.fans) + contents_header + '\n']

        for i in range(self.number1):
            parts.append(str(i + 1) + ":" + self.Content[i] + "\n" +
                         u"发布工具: " + self.publish_tool[i] + "\n" +
                         u"发布时间: " + self.Time[i] + "\n" +
                         u"点赞数: " + str(self.star[i]) +
                         u"转发数: " + str(self.Zhuanfa[i]) +
                         u"评论数: " + str(self.Pinglun[i]) + "\n\n")

        # Binary mode keeps "\n" line endings untouched on every platform.
        with open(file_path, "wb") as f:
            f.write("".join(parts).encode("utf-8"))

    except Exception as e:
        print("Error: ", e)
        traceback.print_exc()

def main():
    """Create the application, build the start window and run the event loop."""
    app = Wb()
    app.Operate()
    app.MainLoop()


if __name__ == "__main__":
    main()
数据分析部分：用 matplotlib 制图。只是粗浅学了一些，所以画得不够精美；数据量较少，分析结果可能有些问题。一共画了四张图：“2017-2018微博转发/点赞量折线统计图”、“原创微博与转发微博统计图”、“微博发布工具统计图”、“微博使用心情统计图”。

import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from matplotlib import font_manager as fm
import time
from datetime import datetime
import webbrowser

class analysis(object):
    """Draw charts from the text file written by the crawler.

    Every ``analyse_*`` method reads the file given at construction time
    (UTF-8), extracts values with regular expressions and opens a
    matplotlib window.
    """

    def __init__(self, file_name, number):
        """Store the data file path and the total post count.

        Args:
            file_name: path of the crawler's text output.
            number: total number of posts; used by analyse_YC to derive the
                number of original posts.
        """
        self.file_name = file_name
        self.number = number
        self.X_data = []   # sampled publish times (line chart x axis)
        self.Y1_data = []  # sampled like counts, in thousands
        self.Y_data = []   # sampled repost counts, in thousands
        self.str = ""

    def _read_text(self):
        """Return the whole data file as one string."""
        with open(self.file_name, encoding="utf-8") as f:
            return f.read()

    def analyse_Zhexian(self):
        """Show a line chart of repost and like counts over time.

        Only the posts listed before the first one dated 2016 are used (all
        posts when there is none). Counts are divided by 1000 and about 18%
        of the points are kept, evenly spaced, so the date labels overlap
        less. Files with fewer than six posts, or without a 2016 post, used
        to crash here (division by zero / unbound ``stop``).
        """
        repost_re = re.compile(r'转发数: (\d+)')
        like_re = re.compile(r'点赞数: (\d+)')
        time_re = re.compile(r'发布时间: (\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})')

        text = self._read_text()
        reposts = [int(x) for x in repost_re.findall(text)]
        likes = [int(x) for x in like_re.findall(text)]
        stamps = time_re.findall(text)

        # Index of the first 2016 post; everything from there on is dropped.
        stop = next((i for i, stamp in enumerate(stamps) if "2016" in stamp),
                    len(stamps))
        count = min(stop, len(reposts), len(likes))

        # Scale to thousands so the y axis stays readable.
        reposts = [value / 1000 for value in reposts[:count]]
        likes = [value / 1000 for value in likes[:count]]
        moments = [datetime.strptime(stamp, "%Y-%m-%d %H:%M")
                   for stamp in stamps[:count]]

        # Thin the data out: split into groups and keep the first point of each.
        self.X_data, self.Y_data, self.Y1_data = [], [], []
        if count:
            groups = max(1, int(0.18 * count))
            step = count // groups
            for i in range(groups):
                self.X_data.append(moments[i * step])
                self.Y_data.append(reposts[i * step])
                self.Y1_data.append(likes[i * step])

        fig1 = plt.figure(figsize=(8, 5))
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
        ax1 = fig1.add_subplot(1, 1, 1)
        ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d %H-%M'))
        plt.xticks(self.X_data, rotation=90)  # vertical date labels
        plt.yticks(np.linspace(0, 5000, 5, endpoint=True))
        plt.title(u"2017-2018微博转发/点赞量折线图", color="black")
        plt.plot(self.X_data, self.Y_data, "o-", color='skyblue', label="转发量", markersize=1.5)
        plt.plot(self.X_data, self.Y1_data, "o-", color='pink', label="点赞量", markersize=1.5)
        plt.xlabel("发布时间")
        plt.ylabel("数量(千/条)")
        plt.legend()
        plt.show()

    def analyse_YC(self):
        """Show a pie chart of reposted versus original posts.

        Reposts are counted as occurrences of "转发理由" in the file; the
        remainder of ``self.number`` is treated as original posts.
        """
        text = self._read_text()
        repost_count = len(re.findall(r'转发理由', text))
        original_count = self.number - repost_count

        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = ['转发微博', '原创微博']
        sizes = [repost_count, original_count]
        explode = (0.1, 0)
        plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
                shadow=False, startangle=150)
        plt.title(u"原创与转发微博量", color="black")
        plt.show()

    def analyse_GJ(self):
        """Show a pie chart of the publishing clients used more than 10 times."""
        from collections import Counter

        text = self._read_text()
        tools = Counter(re.findall(r'发布工具: (.*)\n发布时间', text))
        # Clients seen 10 times or fewer are left out of the chart.
        frequent = {name: seen for name, seen in tools.items() if seen > 10}

        labels = list(frequent.keys())
        sizes = list(frequent.values())
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=150)
        plt.title(u"微博发布工具统计", color="black")
        plt.show()

    def analyse_XQ(self):
        """Show a pie chart of the emoticons used more than twice.

        Emoticons are bracketed names of one to four characters, e.g.
        ``[心]``. Every occurrence is counted; the previous pattern only
        captured the first and last emoticon of lines containing at least
        two of them.
        """
        from collections import Counter

        text = self._read_text()
        emoticons = Counter(re.findall(r'\[([^\[\]]{1,4})\]', text))
        # Emoticons seen twice or fewer are left out of the chart.
        frequent = {name: seen for name, seen in emoticons.items() if seen > 2}

        labels = list(frequent.keys())
        sizes = list(frequent.values())
        fig1, ax1 = plt.subplots()
        patches, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.0f%%',
                                            shadow=False, startangle=170)
        ax1.axis('equal')
        # Use a small font for the wedge labels and percentages.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        proptease = fm.FontProperties()
        proptease.set_size('small')
        plt.title(u"微博表情使用次数", color="black")
        plt.setp(autotexts, fontproperties=proptease)
        plt.setp(texts, fontproperties=proptease)
        plt.show()