豆瓣电影
》》》基本方法
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
from urllib import request

# Basic fetch: grab page 2 of the Douban Top 250 list (items 26-50)
# and dump the raw HTML to stdout.
response = request.urlopen("https://movie.douban.com/top250?start=25&filter=")
content = response.read().decode('utf-8')
print(content)
》》》代理服务器
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
from urllib import request

# Route all HTTP traffic through a proxy server.
# Replace xx.xx.xx.xx:xx with a real proxy host:port before running.
proxy_support = request.ProxyHandler({'http': 'http://xx.xx.xx.xx:xx'})
opener = request.build_opener(proxy_support, request.HTTPHandler)
request.install_opener(opener)  # makes every later urlopen() use the proxy

content = request.urlopen('https://movie.douban.com/').read().decode('utf-8')
print(content)
》》》伪装成浏览器
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
from urllib import request, parse

# Pretend to be a browser by attaching custom headers to the request.
# urlencode() returns str, but Request.data must be bytes — encode it.
postdata = parse.urlencode({}).encode('utf-8')
headers = {
    # e.g. 'User-Agent': 'Mozilla/5.0 ...'
}

req = request.Request(
    url="http://movie.douban.com",  # was "http//movie.douban.com " — the missing ':' made Request raise ValueError
    data=postdata,
    headers=headers,  # was assigned OUTSIDE the call ("headers = headers"), so it never took effect
)
》》》页面信息抓取
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
from urllib import request


class MoviesTop(object):
    """Crawl the first five pages of the Douban Top 250 listing and return their raw HTML."""

    def __init__(self):
        self.start = 0  # offset of the first movie on the next page to fetch
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)'}

    def get_page(self):
        """Download pages at offsets 0, 25, ..., 100 and return their HTML strings as a list.

        Returns None if a URLError aborts the crawl partway through.
        """
        page_content = []
        try:
            while self.start <= 100:
                url = 'https://movie.douban.com/top250?start=' + str(self.start)
                req = request.Request(url, headers=self.headers)
                page = request.urlopen(req).read().decode('utf-8')
                page_num = (self.start + 25) // 25  # 1-based page index for the progress message
                print('抓取' + str(page_num) + '页数据')
                self.start += 25
                page_content.append(page)
            return page_content
        except request.URLError as e:
            # URLError is re-exported by urllib.request from urllib.error
            if hasattr(e, 'reason'):
                print('抓取失败,原因是', e.reason)

    def main(self):
        """Entry point: announce start/end and run the crawl."""
        print('开始抓取数据')
        self.get_page()
        print('数据抓取完毕')


a = MoviesTop()
a.main()
》》》提取相关信息
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import re

# Extract the one-line review (the "inq" span) from a snippet of Douban HTML.
html_text = ' <p class="quote"><span class="inq">希望让人自由。</span></p>'
# Raw string for the regex; anchor on the full '<p' tag — the original pattern
# dropped the leading '<', which would also match text like 'up class="quote"'.
reObj = re.compile(r'<p class="quote"><span class="inq">(.*?)</span></p>')
print(reObj)
a = reObj.findall(html_text)  # non-greedy (.*?) captures just the quote text
print(a)
》》》完整代码并将数据存入txt文件中
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
from urllib import request
import re


class MoviesTop(object):
    """Scrape the first two pages of the Douban Top 250 and write the details to a text file."""

    def __init__(self):
        self.start = 0  # offset of the next listing page to fetch
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)'}
        # one [rank, title, alias, director, cast, votes, quote] entry per movie
        self.movies_list = []
        # was 'D:\movies_spider2.xlsx': '\m' is an invalid escape sequence and the
        # file holds plain text, not a spreadsheet — raw string + .txt extension
        self.file_path = r'D:\movies_spider2.txt'

    def get_page(self):
        """Fetch one listing page, advance self.start by 25, and return the HTML.

        Returns None when the request fails with a URLError.
        """
        try:
            url = 'https://movie.douban.com/top250?start=' + str(self.start)
            req = request.Request(url, headers=self.headers)
            response = request.urlopen(req)
            page = response.read().decode('utf-8')
            page_num = (self.start + 25) // 25
            print('抓取' + str(page_num) + '页数据')
            self.start += 25
            return page
        except request.URLError as e:
            if hasattr(e, 'reason'):
                print('抓取失败,原因是', e.reason)

    def get_movies_info(self):
        """Parse rank/title/alias/director/cast/votes/quote out of each fetched page."""
        pattern = re.compile(u'<em.*?class="">(.*?)</em>.*?'
                             + u'<span.*?class="title">(.*?)</span>.*?'
                             + u'<span.*?class="title"> / (.*?)</span>.*?'
                             + u'导演:(.*?) .*?'
                             + u'主演:(.*?)<br>.*?'
                             + u'<span>(.*?)人评价</span>.*?'
                             + u'<span.*?class="inq">(.*?)</span>.*?', re.S)

        while self.start < 50:  # two pages: offsets 0 and 25
            page = self.get_page()
            if page is None:
                # download failed and self.start was not advanced — stop instead
                # of crashing in re.findall(pattern, None) or looping forever
                break
            movies = re.findall(pattern, page)
            for movie in movies:
                print(movie)
                self.movies_list.append(list(movie))
                print(movie[6])

    def write_text(self):
        """Write every collected movie record to self.file_path, one labelled field per line."""
        print("开始写数据")
        try:
            # 'with' guarantees the file is closed even if a write fails
            with open(self.file_path, 'w', encoding='utf-8') as file_top:
                for movie in self.movies_list:
                    file_top.write('电影排名: ' + movie[0] + '\r\n')
                    file_top.write('电影名称:' + movie[1] + '\r\n')
                    file_top.write('电影别名:' + movie[2] + '\r\n')
                    file_top.write('导演姓名:' + movie[3] + '\r\n')
                    file_top.write('主演姓名:' + movie[4] + '\r\n')
                    file_top.write('参评人数:' + movie[5] + '\r\n')
                    # was '剪短影评' ("cut short") — typo for '简短影评' ("brief review")
                    file_top.write('简短影评:' + movie[6] + '\r\n')
                print('写入成功')
        except Exception as e:
            print(e)

    def main(self):
        """Entry point: crawl, then persist to the text file."""
        print('开始抓取数据')
        self.get_movies_info()
        self.write_text()
        print('数据抓取完毕')


a = MoviesTop()
a.main()
》》》完整代码并将数据存入数据库
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
from urllib import request
import re, MySQLdb


class MoviesTop(object):
    """Scrape the first page of the Douban Top 250 and insert the records into MySQL."""

    def __init__(self):
        self.start = 0  # offset of the next listing page to fetch
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)'}
        # one [rank, title, alias] entry per movie
        self.movies_list = []

    def get_page(self):
        """Fetch one listing page, advance self.start by 25, and return the HTML.

        Returns None when the request fails with a URLError.
        """
        try:
            url = 'https://movie.douban.com/top250?start=' + str(self.start)
            req = request.Request(url, headers=self.headers)
            response = request.urlopen(req)
            page = response.read().decode('utf-8')
            page_num = (self.start + 25) // 25
            print('抓取' + str(page_num) + '页数据')
            self.start += 25
            return page
        except request.URLError as e:
            if hasattr(e, 'reason'):
                print('抓取失败,原因是', e.reason)

    def get_movies_info(self):
        """Parse rank/title/alias triples out of each fetched page into self.movies_list."""
        pattern = re.compile(u'<em.*?class="">(.*?)</em>.*?'
                             + u'<span.*?class="title">(.*?)</span>.*?'
                             + u'<span.*?class="title"> / (.*?)</span>.*?', re.S)

        while self.start < 25:  # a single page: offset 0
            page = self.get_page()
            if page is None:
                # download failed and self.start was not advanced — stop instead
                # of crashing in re.findall(pattern, None) or looping forever
                break
            movies = re.findall(pattern, page)
            # NOTE: dropped the old debug prints of movies[1] — they raised
            # IndexError whenever fewer than two movies matched
            for movie in movies:
                print(movie)
                self.movies_list.append([movie[0], movie[1], movie[2]])

    def insert_into_sql(self):
        """Insert every (rank, name, other-name) row into table m1 in one transaction."""
        conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                               passwd='123666', db='test', charset='utf8')
        cur = conn.cursor()
        try:
            for movie in self.movies_list:
                # Parameterized query (PEP 249): the driver escapes quotes in the
                # scraped text — the old "...values('%s',...)" % (...) string build
                # broke on titles containing quotes and was open to SQL injection.
                cur.execute(
                    "insert into m1 (moviesRank,MoviesName,OtherName) values(%s,%s,%s)",
                    (movie[0], movie[1], movie[2]))
                print('>>>>')
            conn.commit()  # one commit for the whole batch instead of per row
        except Exception as e:
            print(e)
        finally:
            # the old code never closed these, leaking the connection
            cur.close()
            conn.close()

    def main(self):
        """Entry point: crawl, then persist to the database."""
        print('开始抓取数据')
        self.get_movies_info()
        self.insert_into_sql()
        print('数据抓取完毕')


b = MoviesTop()
b.main()