Douban Movies

》》》 Basic approach

from urllib import request

response = request.urlopen("https://movie.douban.com/top250?start=25&filter=")
content = response.read().decode('utf-8')
print(content)
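A small variation on the same call, in case the page encoding is not known in advance: the charset advertised in the response headers can be used instead of hard-coding utf-8 (this is only a sketch, not part of the original post).

from urllib import request

response = request.urlopen("https://movie.douban.com/top250?start=25&filter=")
# use the charset reported by the server, falling back to utf-8
charset = response.headers.get_content_charset() or 'utf-8'
content = response.read().decode(charset)
print(content)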

》》》 Using a proxy server

from urllib import request
proxy_support = request.ProxyHandler({'http': 'http://xx.xx.xx.xx:xx'})
opener = request.build_opener(proxy_support, request.HTTPHandler)
request.install_opener(opener)

content = request.urlopen('https://movie.douban.com/').read().decode('utf-8')
print(content)
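The proxy address above is only a placeholder. A quick, hypothetical way to check that requests really go through the proxy is to fetch an IP-echo service with the same opener (httpbin.org is used here purely as an example and is not part of the original post):

from urllib import request

proxy_support = request.ProxyHandler({'http': 'http://xx.xx.xx.xx:xx'})  # placeholder address
opener = request.build_opener(proxy_support, request.HTTPHandler)
# the echoed IP should be the proxy's address, not your own
print(opener.open('http://httpbin.org/ip').read().decode('utf-8'))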

》》》 Disguising the request as a browser

from urllib import request, parse

# Request data must be bytes; an empty form body is encoded here only to show the shape
postdata = parse.urlencode({}).encode('utf-8')
headers = {
    # the same User-Agent string is reused by the scraping class below
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
}

req = request.Request(
    url="https://movie.douban.com",
    data=postdata,
    headers=headers,
)
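The snippet above only builds the Request object; continuing from it, the page is actually fetched with urlopen just like in the earlier examples:

# send the disguised request and print the page
response = request.urlopen(req)
print(response.read().decode('utf-8'))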

》》》 Scraping the listing pages

from urllib import request, error

class MoviesTop(object):
    def __init__(self):
        self.start = 0
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}

    def get_page(self):
        page_content = []
        try:
            while self.start <= 100:
                # same URL shape as the basic example, including the empty filter parameter
                url = 'https://movie.douban.com/top250?start=' + str(self.start) + self.param
                req = request.Request(url, headers=self.headers)
                response = request.urlopen(req)
                page = response.read().decode('utf-8')
                page_num = (self.start + 25) // 25

                print('Fetched page ' + str(page_num))
                self.start += 25
                page_content.append(page)
            return page_content
        except error.URLError as e:
            if hasattr(e, 'reason'):
                print('Fetch failed, reason:', e.reason)

    def main(self):
        print('Start fetching data')
        self.get_page()
        print('Fetching finished')

a = MoviesTop()
a.main()
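For reference, the Top 250 spans ten pages of 25 entries, so the loop above (which stops once start passes 100) only covers the first five pages. The full set of listing URLs follows the same pattern:

# the ten page URLs of the Top 250, 25 entries per page: start = 0, 25, ..., 225
for start in range(0, 250, 25):
    print('https://movie.douban.com/top250?start=' + str(start))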

》》》 Extracting the relevant fields

import re

# the non-greedy (.*?) group captures the quote text between the two tags
html_text = '<p class="quote"><span class="inq">希望让人自由。</span></p>'
reObj = re.compile('p class="quote"><span class="inq">(.*?)</span></p>')
print(reObj)
a = reObj.findall(html_text)
print(a)
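With more than one capture group, findall returns a list of tuples rather than strings, which is what the full scripts below rely on. A minimal sketch with made-up HTML:

import re

# two groups -> findall yields (rank, title) tuples
sample = '<em class="">1</em><span class="title">SomeTitle</span>'
pattern = re.compile('<em.*?class="">(.*?)</em>.*?<span.*?class="title">(.*?)</span>', re.S)
print(pattern.findall(sample))  # [('1', 'SomeTitle')]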

》》》 Full script, saving the data to a txt file

from urllib import request, error
import re

class MoviesTop(object):
    def __init__(self):
        self.start = 0
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        self.movies_list = []
        self.file_path = r'D:\movies_spider2.txt'

    def get_page(self):
        try:
            url = 'https://movie.douban.com/top250?start=' + str(self.start) + self.param
            req = request.Request(url, headers=self.headers)
            response = request.urlopen(req)
            page = response.read().decode('utf-8')
            page_num = (self.start + 25) // 25
            print('Fetched page ' + str(page_num))
            self.start += 25
            return page
        except error.URLError as e:
            if hasattr(e, 'reason'):
                print('Fetch failed, reason:', e.reason)

    def get_movies_info(self):
        # groups: rank, title, alternative title, director, cast, rating count, short quote
        pattern = re.compile('<em.*?class="">(.*?)</em>.*?'
                             + '<span.*?class="title">(.*?)</span>.*?'
                             + '<span.*?class="title">&nbsp;/&nbsp;(.*?)</span>.*?'
                             + '导演:(.*?)&nbsp;&nbsp;&nbsp;.*?'
                             + '主演:(.*?)<br>.*?'
                             + '<span>(.*?)人评价</span>.*?'
                             + '<span.*?class="inq">(.*?)</span>.*?', re.S)
        print('>>>', pattern)

        while self.start < 50:
            page = self.get_page()
            movies = re.findall(pattern, page)
            for movie in movies:
                print(movie)
                self.movies_list.append([movie[0], movie[1], movie[2], movie[3],
                                         movie[4], movie[5], movie[6]])
                print(movie[6])

    def write_text(self):
        print('Start writing data')
        file_top = open(self.file_path, 'w', encoding='utf-8')
        try:
            for movie in self.movies_list:
                file_top.write('Rank: ' + movie[0] + '\r\n')
                file_top.write('Title: ' + movie[1] + '\r\n')
                file_top.write('Alternative title: ' + movie[2] + '\r\n')
                file_top.write('Director: ' + movie[3] + '\r\n')
                file_top.write('Cast: ' + movie[4] + '\r\n')
                file_top.write('Number of ratings: ' + movie[5] + '\r\n')
                file_top.write('Short quote: ' + movie[6] + '\r\n')

            print('Write succeeded')
        except Exception as e:
            print(e)
        finally:
            file_top.close()

    def main(self):
        print('Start fetching data')
        self.get_movies_info()
        self.write_text()
        print('Fetching finished')

a = MoviesTop()
a.main()
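One caveat: findall only returns entries that match the whole pattern, so any film whose listing lacks one of the pieces (for example the short inq quote) is silently dropped. If that matters, the quote part can be wrapped in an optional non-capturing group; a small sketch of the idea (not the original code):

import re

# (?:...)? makes the quote optional; a group that does not participate comes back as ''
pattern = re.compile('<span class="title">(.*?)</span>.*?'
                     '(?:<span class="inq">(.*?)</span>)?$', re.S)
print(pattern.findall('<span class="title">A</span> <span class="inq">quote</span>'))  # [('A', 'quote')]
print(pattern.findall('<span class="title">B</span>'))                                 # [('B', '')]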

》》》 Full script, saving the data to a database

from urllib import request, error
import re
import MySQLdb


class MoviesTop(object):
    def __init__(self):
        self.start = 0
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        self.movies_list = []

    def get_page(self):
        try:
            url = 'https://movie.douban.com/top250?start=' + str(self.start) + self.param
            req = request.Request(url, headers=self.headers)
            response = request.urlopen(req)
            page = response.read().decode('utf-8')
            page_num = (self.start + 25) // 25
            print('Fetched page ' + str(page_num))
            self.start += 25
            return page
        except error.URLError as e:
            if hasattr(e, 'reason'):
                print('Fetch failed, reason:', e.reason)

    def get_movies_info(self):
        # groups: rank, title, alternative title
        pattern = re.compile('<em.*?class="">(.*?)</em>.*?'
                             + '<span.*?class="title">(.*?)</span>.*?'
                             + '<span.*?class="title">&nbsp;/&nbsp;(.*?)</span>.*?', re.S)

        while self.start < 25:
            page = self.get_page()
            movies = re.findall(pattern, page)
            for movie in movies:
                print(movie)
                self.movies_list.append([movie[0], movie[1], movie[2]])

    def insert_into_sql(self):
        conn = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='123666',
                               db='test', charset='utf8')
        cur = conn.cursor()
        try:
            for movie in self.movies_list:
                # parameterised query; the driver handles quoting and escaping
                cur.execute("insert into m1 (moviesRank, MoviesName, OtherName) values (%s, %s, %s)",
                            (movie[0], movie[1], movie[2]))
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            cur.close()
            conn.close()

    def main(self):
        print('Start fetching data')
        self.get_movies_info()
        self.insert_into_sql()
        print('Fetching finished')


b = MoviesTop()
b.main()
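The script assumes a table named m1 already exists in the test database; its schema is not shown in the post. A guessed minimal schema matching the three columns used by the INSERT might look like this (column types and lengths are assumptions):

import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='123666',
                       db='test', charset='utf8')
cur = conn.cursor()
# column names taken from the INSERT statement above; the VARCHAR sizes are only a guess
cur.execute("""
    CREATE TABLE IF NOT EXISTS m1 (
        moviesRank VARCHAR(10),
        MoviesName VARCHAR(100),
        OtherName  VARCHAR(100)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()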

 
