Python爬虫(七)
源码:
import re
from urllib.parse import urljoin

import requests

from my_mysql import MysqlConnect

# Site root used to resolve the hrefs found on list pages.
SITE_ROOT = 'http://www.dytt8.net/'
# List-page URL template; `{}` is the page number.
LIST_URL = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
# Seconds to wait on a request before giving up, so a stalled
# connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10

# Patterns are compiled once instead of being rebuilt on every call.
DETAIL_LINK_RE = re.compile(r'<a href="(.*?)" class="ulink">(.*?)</a>')
MAGNET_RE = re.compile(r'href="(magnet.*?)"')
FTP_RE = re.compile(r'href="(ftp.*?)"')


def _fetch_html(url):
    """Download *url* and return the response body decoded as GBK.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of regex-scanning an error page.
    response.raise_for_status()
    response.encoding = 'gbk'
    return response.text


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = pattern.search(text)
    return match.group(1) if match else None


def _sql_literal(value):
    """Render *value* as a SQL literal: ``null`` for None, else its repr."""
    return 'null' if value is None else repr(value)


# Get the detail-page links and the movie names.
def get_urls(page):
    """Return the ``(href, name)`` pairs found on list page number *page*.

    Args:
        page: Page number substituted into the list-page URL.

    Returns:
        A list of ``(href, name)`` tuples, one per ``class="ulink"`` anchor;
        empty when the page contains none.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    return DETAIL_LINK_RE.findall(_fetch_html(LIST_URL.format(page)))


# Get the magnet link (and the FTP link).
def get_links(url):
    """Return the ``(magnet, ftp)`` download links found on a detail page.

    Args:
        url: Absolute URL of the movie detail page.

    Returns:
        A 2-tuple ``(magnet, ftp)``. Each element is the first matching
        href on the page, or None when the page has no such link (the
        original code crashed with AttributeError in that case).

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    html = _fetch_html(url)
    return _first_group(MAGNET_RE, html), _first_group(FTP_RE, html)


def main():
    """Crawl list pages 1-3 and insert every movie found into ``dytt``."""
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
    for page in range(1, 4):
        try:
            movies = get_urls(page)
        except requests.RequestException as err:
            # One unreachable list page should not abort the whole crawl.
            print('skip list page {}: {}'.format(page, err))
            continue
        for href, name in movies:
            # urljoin copes with both root-relative and relative hrefs and
            # avoids the double slash plain concatenation can produce.
            detail_url = urljoin(SITE_ROOT, href)
            try:
                magnet, ftp = get_links(detail_url)
            except requests.RequestException as err:
                print('skip {}: {}'.format(detail_url, err))
                continue
            # NOTE(review): the values come from scraped HTML, and building
            # SQL by string formatting is injection-prone. MysqlConnect's
            # API is not visible here; switch to a parameterized query if
            # `exec` supports bound parameters.
            sql = 'insert into dytt(id,name,magnet,ftp) values(null,{},{},{})'.format(
                _sql_literal(name), _sql_literal(magnet), _sql_literal(ftp))
            print(sql)
            mc.exec(sql)


if __name__ == '__main__':
    main()