Python爬虫爬取搜狐视频电影并存储到mysql数据库
数据获取方式:微信搜索关注【靠谱杨阅读人生】回复【电影】。
整理不易,资源付费,谢谢支持。
代码:
1 import time
2 import traceback
3 import requests
4 from lxml import etree
5 import re
6 from bs4 import BeautifulSoup
7 from lxml.html.diff import end_tag
8 import json
9 import pymysql
10 #连接数据库 获取游标
11 def get_conn():
12 """
13 :return: 连接,游标
14 """
15 # 创建连接
16 conn = pymysql.connect(host="127.0.0.1",
17 user="root",
18 password="000429",
19 db="movierankings",
20 charset="utf8")
21 # 创建游标
22 cursor = conn.cursor() # 执行完毕返回的结果集默认以元组显示
23 if ((conn != None) & (cursor != None)):
24 print("数据库连接成功!游标创建成功!")
25 else:
26 print("数据库连接失败!")
27 return conn, cursor
28 #关闭数据库连接和游标
29 def close_conn(conn, cursor):
30 if cursor:
31 cursor.close()
32 if conn:
33 conn.close()
34 return 1
35
36 def get_souhu():
37 url='https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'
38 #最新上架
39 new_url='https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'
40 #本周热播
41 week_url='https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'
42 headers={
43 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
44 }
45
46 #初始化list
47 templist=[]
48 dataRes=[]
49 #最受好评
50 for i in range(1,31):
51 url_1='https://film.sohu.com/list_0_0_0_2_2_'
52 auto=str(i)
53 url_2='_60.html?channeled=1200100000'
54 url=url_1+auto+url_2
55 response = requests.get(url, headers)
56 response.encoding = 'utf-8'
57 page_text = response.text
58 # etree_ = etree.HTML(page_text)
59 # 获取所有的li
60 soup = BeautifulSoup(page_text, 'lxml')
61 # 标签层级选择
62 li_list = soup.select('.movie-list>li')
63 print(len(li_list))
64 if(len(li_list)==0):
65 print("最受好评爬取结束!")
66 if(len(dataRes)!=0):
67 return dataRes
68 for li in li_list:
69 li_text=str(li)
70 # print(li_text)
71 li_soup=BeautifulSoup(li_text,'lxml')
72 name=li_soup.find('div',class_="v_name_info").text
73 #添加名字
74 templist.append(name)
75 # print(name)
76 #添加评分
77 score=li_soup.find('span',class_='v_score').text
78 #处理评分
79 score=score[-4:-1]
80 templist.append(score)
81 # print(score)
82 #添加path
83 path=li_soup.find('a',target="_blank")['href']
84 templist.append(path)
85 # print(path)
86 #添加播放状态
87 state="VIP"
88 templist.append(state)
89 print(templist)
90 dataRes.append(templist)
91 templist=[]
92 print("-------------------------------------------")
93 # print(len(dataRes))
94
95 # #最新上架
96 #
97 # templist = []
98 # for i in range(1,31):
99 # url_1='https://film.sohu.com/list_0_0_0_2_1_'
100 # auto=str(i)
101 # url_2='_60.html?channeled=1200100000'
102 # url=url_1+auto+url_2
103 # response = requests.get(url, headers)
104 # response.encoding = 'utf-8'
105 # page_text = response.text
106 # # etree_ = etree.HTML(page_text)
107 # # 获取所有的li
108 # soup = BeautifulSoup(page_text, 'lxml')
109 # # 标签层级选择
110 # li_list = soup.select('.movie-list>li')
111 # print(len(li_list))
112 # if(len(li_list)==0):
113 # print("最新上架爬取结束!")
114 # if(len(dataRes)!=0):
115 # return dataRes
116 # for li in li_list:
117 # li_text=str(li)
118 # # print(li_text)
119 # li_soup=BeautifulSoup(li_text,'lxml')
120 # name=li_soup.find('div',class_="v_name_info").text
121 # #添加名字
122 # templist.append(name)
123 # # print(name)
124 # #添加评分
125 # score=li_soup.find('span',class_='v_score').text
126 # #处理评分
127 # score=score[-4:-1]
128 # templist.append(score)
129 # # print(score)
130 # #添加path
131 # path=li_soup.find('a',target="_blank")['href']
132 # templist.append(path)
133 # # print(path)
134 # #添加播放状态
135 # state="VIP"
136 # templist.append(state)
137 # print(templist)
138 # dataRes.append(templist)
139 # templist=[]
140 # print("-------------------------------------------")
141 # # print(len(dataRes))
142 # #本周热播
143 # templist = []
144 # for i in range(1, 31):
145 # url_1 = 'https://film.sohu.com/list_0_0_0_2_0_'
146 # auto = str(i)
147 # url_2 = '_60.html?channeled=1200100000'
148 # url = url_1 + auto + url_2
149 # response = requests.get(url, headers)
150 # response.encoding = 'utf-8'
151 # page_text = response.text
152 # # etree_ = etree.HTML(page_text)
153 # # 获取所有的li
154 # soup = BeautifulSoup(page_text, 'lxml')
155 # # 标签层级选择
156 # li_list = soup.select('.movie-list>li')
157 # print(len(li_list))
158 # if (len(li_list) == 0):
159 # print("本周热播爬取结束!")
160 # if (len(dataRes) != 0):
161 # return dataRes
162 # for li in li_list:
163 # li_text = str(li)
164 # # print(li_text)
165 # li_soup = BeautifulSoup(li_text, 'lxml')
166 # name = li_soup.find('div', class_="v_name_info").text
167 # # 添加名字
168 # templist.append(name)
169 # # print(name)
170 # # 添加评分
171 # score = li_soup.find('span', class_='v_score').text
172 # # 处理评分
173 # score = score[-4:-1]
174 # templist.append(score)
175 # # print(score)
176 # # 添加path
177 # path = li_soup.find('a', target="_blank")['href']
178 # templist.append(path)
179 # # print(path)
180 # # 添加播放状态
181 # state = "VIP"
182 # templist.append(state)
183 # print(templist)
184 # dataRes.append(templist)
185 # templist = []
186 # print("-------------------------------------------")
187 # print(len(dataRes))
188 #list去重
189 # old_list = dataRes
190 # new_list = []
191 # for i in old_list:
192 # if i not in new_list:
193 # new_list.append(i)
194 # print(new_list) # [2, 3, 4, 5, 1]
195 return dataRes
196 #插入数据库
197 def insert_souhu():
198 cursor = None
199 conn = None
200 try:
201 count=0
202 list = get_souhu()
203 print(f"{time.asctime()}开始插入搜狐电影数据")
204 conn, cursor = get_conn()
205 sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
206 for item in list:
207 print(item)
208 count = count + 1
209 #异常捕获,防止数据库主键冲突
210 try:
211 cursor.execute(sql, [0, item[0], item[1], item[2], item[3] ])
212 except pymysql.err.IntegrityError:
213 print("重复!跳过!")
214 conn.commit() # 提交事务 update delete insert操作
215 print(f"{time.asctime()}插入搜狐电影数据完毕")
216 except:
217 traceback.print_exc()
218 finally:
219 close_conn(conn, cursor)
220 return;
221
222 if __name__ == '__main__':
223 # get_iqy()
224 # get_souhu()
225 insert_souhu()
运行截图
数据库截图
建表语句
1 CREATE TABLE `moviesohu` (
2 `id` INT(11) NOT NULL AUTO_INCREMENT,
3 `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
4 `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
5 `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
6 `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
7 PRIMARY KEY (`name`),
8 KEY `id` (`id`)
9 ) ENGINE=INNODB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
好看请赞,养成习惯:) 本文来自博客园,作者:靠谱杨, 转载请注明原文链接:https://www.cnblogs.com/rainbow-1/p/14772320.html
欢迎来我的51CTO博客主页踩一踩 我的51CTO博客
文章中的公众号名称可能有误,请统一搜索:靠谱杨的秘密基地