# Scrape page data with a crawler and compute word-frequency statistics
import re
from collections import Counter

import jieba
import requests

# Fetch the list page. The timeout keeps the script from hanging forever on
# an unresponsive server, and raise_for_status() stops us from counting the
# words of an HTTP error page.
response = requests.get(
    'http://www.haha56.net/xiaohua/gushi/list_1_2.html', timeout=10
)
response.raise_for_status()
response.encoding = 'gbk'  # force GBK decoding of the response body
data = response.text

# Each preview snippet sits inside <dd class="preview">...</dd>.
content_res = re.findall('<dd class="preview">(.*?)</dd>', data)

# Join the snippets into plain text. (Calling str() on the list would feed
# the list's repr -- brackets, quotes, commas and escape sequences -- to the
# tokenizer and pollute the counts.) The newline separator keeps the last
# word of one snippet from fusing with the first word of the next; it is
# itself a one-character token and is filtered out below.
res = '\n'.join(content_res)
res_cut = jieba.lcut(res)

# Count tokens, ignoring single-character tokens and the '...' ellipsis.
dic = Counter(
    word for word in res_cut
    if len(word) != 1 and word != '...'
)

# List of (word, count) pairs, most frequent first.
dic_list = dic.most_common()
print(dic_list)
# Word cloud
import re

import jieba
import requests
import wordcloud

# Fetch the list page. The timeout prevents an indefinite hang and
# raise_for_status() avoids rendering a cloud from an HTTP error page.
response = requests.get(
    'http://www.haha56.net/xiaohua/gushi/list_1_2.html', timeout=10
)
response.raise_for_status()
response.encoding = 'gbk'  # force GBK decoding of the response body
data = response.text

# Each preview snippet sits inside <dd class="preview">...</dd>.
content_res = re.findall('<dd class="preview">(.*?)</dd>', data)

# WordCloud splits its input on word boundaries, but Chinese text has no
# spaces between words, so an unsegmented string would be treated as a few
# very long "words". Segment with jieba first and hand WordCloud a
# space-separated token stream.
res = ' '.join(jieba.lcut('\n'.join(content_res)))

# NOTE(review): the font path has no file extension, and the font must
# contain CJK glyphs for Chinese words to render -- confirm on the target
# machine.
w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\LHANDW')
w.generate(res)
w.to_file("ciyun.png")
# Crawl images
import re
from urllib.parse import urljoin

import requests

# Fetch the gallery page; the timeout prevents an indefinite hang.
page_url = 'http://www.nipic.com/design/acg/renwu/index.html?page=1'
response = requests.get(page_url, timeout=10)
response.raise_for_status()
data = response.text

# Image addresses are taken from the data-src attributes in the page.
img_url_res = re.findall('data-src="(.*?)"', data)
for i in img_url_res:
    # Resolve relative or protocol-relative ("//host/...") addresses against
    # the page URL; absolute URLs pass through unchanged.
    img_url = urljoin(page_url, i)

    # The file name is the last path segment of the URL; skip addresses that
    # end in '/' because open('') would fail.
    img_name = img_url.split('/')[-1]
    if not img_name:
        continue

    img_response = requests.get(img_url, timeout=10)
    if not img_response.ok:
        # Do not save an HTTP error page under an image file name.
        print('skipped', img_url, img_response.status_code)
        continue
    img_data = img_response.content

    # The with-block flushes and closes the file even if the write fails,
    # so no handle is leaked per image.
    with open(img_name, 'wb') as f:
        f.write(img_data)
# Crawl videos
import re
from urllib.parse import urljoin

import requests

# Fetch the video index page; the timeout prevents an indefinite hang.
index_url = 'http://www.mod.gov.cn/v/index.htm'
response = requests.get(index_url, timeout=10)
response.raise_for_status()
data = response.text

# Every <a href="..."> on the index page is a candidate video detail page.
mp4_res2 = re.findall('<a href="(.*?)">', data)

downloaded = set()  # video URLs already saved, so repeated links are skipped
for i in mp4_res2:  # type: str
    # Keep only the part of the href up to and including the first "htm";
    # links without it are not detail pages and are skipped instead of
    # raising IndexError.
    page_match = re.findall('(.*?htm)', i)
    if not page_match:
        continue

    # Resolve the link against the index URL (relative links end up under
    # http://www.mod.gov.cn/v/, absolute links pass through unchanged).
    res = urljoin(index_url, page_match[0])
    page_response = requests.get(res, timeout=10)
    if not page_response.ok:
        continue

    # The video address is read from a "//Video <url>.mp4" marker in the
    # detail page; pages without one are skipped.
    video_match = re.findall(r'//Video (.*?\.mp4)', page_response.text)
    if not video_match:
        continue
    url_res = urljoin(res, video_match[0])
    if url_res in downloaded:
        continue
    downloaded.add(url_res)

    # Save each video under its own file name (last URL path segment) so
    # later downloads do not overwrite earlier ones.
    mp4_name = url_res.split('/')[-1]

    # Stream the body to disk in chunks instead of holding the whole video
    # in memory; both with-blocks release the connection and the file handle
    # even on error.
    with requests.get(url_res, stream=True, timeout=30) as mp4_response:
        if not mp4_response.ok:
            print('skipped', url_res, mp4_response.status_code)
            continue
        with open(mp4_name, 'wb') as f:
            for chunk in mp4_response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)