I. Web Scraping Basics
1) Basic scraping workflow: downloading images
# Step 1: fetch the page HTML
import requests
response = requests.get(url='http://...')
# ......

# Step 2: parse the HTML with BeautifulSoup
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, features='html.parser')  # features='lxml' performs slightly better
# .....

# Step 3: save the content locally
import uuid
file_name = str(uuid.uuid1()) + '.jpg'
with open(file_name, 'wb') as f:
    f.write(img_response.content)
1.1) Scraping images
# Step 1: fetch the page HTML
import requests
import uuid

response = requests.get(
    url='http://www.autohome.com.cn/news'
)
# print(response.text)

# Handling garbled encodings:
# Option 1: set the encoding by hand, e.g. gbk
# response.encoding = 'gbk'
# print(response.content)
# Option 2: let requests infer the encoding from the response body
response.encoding = response.apparent_encoding
# print(response.text)

# Step 2: parse the HTML
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, features='html.parser')  # features='lxml' performs slightly better

target = soup.find(id='auto-channel-lazyload-article')  # the tag with this id
# print(target)
li_list = target.find_all('li')  # all li tags under it; returns a list
# print(li_list)
for i in li_list:
    a = i.find('a')  # the a tag inside each li
    # print(a)
    # print(a.attrs)  # the a tag's attributes
    if a:
        # print(a.attrs.get('href'))  # the href attribute
        # txt = a.find('h3')  # a bs4 Tag object
        txt = a.find('h3').text  # the text content
        # print(txt)
        img = a.find('img').attrs.get('src')
        print(img)
        img_head = 'http:'
        img_url = img_head + img  # the src is protocol-relative, so prepend the scheme
        print(img_url, type(img_url))

        # Step 3: save the image locally
        img_response = requests.get(url=img_url)
        print(img_response)
        file_name = str(uuid.uuid1()) + '.jpg'
        with open(file_name, 'wb') as f:
            f.write(img_response.content)
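For larger images it may be worth streaming the download instead of buffering the whole body in memory. A minimal sketch under that idea; the download_image helper name is ours, and img_url is assumed to be a full URL like the one built above:

import uuid
import requests

def download_image(img_url):
    # Stream the response so large files are written chunk by chunk
    # rather than being held fully in memory (img_url is assumed valid).
    file_name = str(uuid.uuid1()) + '.jpg'
    with requests.get(url=img_url, stream=True) as img_response:
        with open(file_name, 'wb') as f:
            for chunk in img_response.iter_content(chunk_size=8192):
                f.write(chunk)
    return file_name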
1.2) Summary of the image-scraping workflow
1. requests
   pip3 install requests

   response = requests.get('http://www.autohome.com.cn/news/')
   response.text

   Summary:
   response = requests.get('URL')
   response.text
   response.content
   response.encoding
   response.apparent_encoding
   response.status_code
   response.cookies.get_dict()

2. beautifulsoup4 module
   pip3 install beautifulsoup4

   from bs4 import BeautifulSoup
   soup = BeautifulSoup(response.text, features='html.parser')
   target = soup.find(id='auto-channel-lazyload-article')
   print(target)

   Summary:
   soup = BeautifulSoup('<html>...</html>', features='html.parser')

   v1 = soup.find('div')
   v1 = soup.find(id='i1')
   v1 = soup.find('div', id='i1')

   v2 = soup.find_all('div')
   v2 = soup.find_all(id='i1')
   v2 = soup.find_all('div', id='i1')

   obj = v1
   obj = v2[0]

   obj.text
   obj.attrs
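The find / find_all / text / attrs calls summarized above can be tried without any network access by feeding BeautifulSoup an inline HTML string. A minimal sketch; the HTML snippet is made up for illustration:

from bs4 import BeautifulSoup

html = '<div id="i1"><a href="/news/1"><h3>headline</h3></a></div>'
soup = BeautifulSoup(html, features='html.parser')

v1 = soup.find('div', id='i1')   # first matching tag (a bs4 Tag object)
a = v1.find('a')
print(a.attrs.get('href'))       # -> /news/1
print(a.find('h3').text)         # -> headline

v2 = soup.find_all('div')        # all matches, as a list
print(v2[0].text)                # -> headline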
2) Request headers: impersonating a browser. Many sites refuse requests that do not carry browser-like request headers.
2.1) Without request headers
import requests

post_dict = {
    'phone': '1111111',
    'password': '1111111',
    'oneMonth': 1
}
response = requests.post(
    url='https://dig.chouti.com/login',
    data=post_dict,
)
print(response)
<Response [403]>  # access denied
2.2) With request headers
import requests

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': '__guid=9498528.323570739154576450.1534233211072.796; gpid=7277f557513c4639b2928de61853af52; gpsd=72d76079e281788ad0075160a54990a9; JSESSIONID=aaaSaxN93bwl6gJQkLgxw; monitor_count=2',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
post_dict = {
    'phone': '1111111',
    'password': '1111111',
    'oneMonth': 1
}
response = requests.post(
    url='https://dig.chouti.com/login',
    data=post_dict,
    headers=headers
)
print(response)
<Response [200]>  # access allowed
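When every request to a site needs the same headers, a requests.Session avoids repeating them. A minimal sketch, reusing the headers and post_dict dicts defined above:

import requests

session = requests.Session()
session.headers.update(headers)   # these headers are sent on every request made via this session
response = session.post('https://dig.chouti.com/login', data=post_dict)
print(response)                   # cookies set by the server are kept on the session automatically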
II. Scraping Applications
1) Scraping the Maoyan movie board and saving the text to a file: http://maoyan.com/board/4?offset=0
import random
import requests
import re

# Pool of User-Agent headers; one is picked at random per request
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]

print(len(hds))
print(random.randint(0, len(hds) - 1))
print(hds[random.randint(0, len(hds) - 1)])  # a randomly chosen request header


def get_page(url):
    # Request the url and return the page body as a string
    # url = "http://maoyan.com/board/4?offset=10"
    headers = hds[random.randint(0, len(hds) - 1)]  # pick a random User-Agent
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        print(e)


'''
Sample of the HTML being parsed:
<div class="board-item-content">
    <div class="movie-item-info">
        <p class="name"><a href="/films/1203" title="霸王别姬" data-act="boarditem-click" data-val="{movieId:1203}">霸王别姬</a></p>
        <p class="star">
            主演:张国荣,张丰毅,巩俐
        </p>
        <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
    </div>
    <div class="movie-item-number score-num">
        <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
    </div>
</div>
'''


def get_movie(html):
    # Extract the text fields we need, e.g.
    # 霸王别姬  主演:张国荣,张丰毅,巩俐  上映时间:1993-01-01
    # pattern = '<p.*?><a.*?> (.*?) </a></p>.*?<p.*?> (.*?) </p>.*?<p.*?> (.*?) </p>'
    pattern = '<p.*?><a.*?>(.*?)</a></p>.*?<p.*?>(.*?)</p>.*?<p.*?>(.*?)</p>'
    items = re.findall(pattern, html, re.S)
    print(items, '-=====')
    # Each item is a tuple of p-tag text, e.g.
    # ('霸王别姬', '\n 主演:张国荣,张丰毅,巩俐\n ', '上映时间:1993-01-01')
    return items


def write_file(items):
    # Append the extracted fields to a text file
    fileMovie = open('movie.txt', 'a+', encoding='utf8')
    try:
        for movie in items:
            fileMovie.write('Title: ' + movie[0] + '\r\n')
            fileMovie.write('Starring: ' + movie[1].strip() + '\r\n')
            fileMovie.write('Release date: ' + movie[2] + '\r\n\r\n')
        print('File written successfully...')
    finally:
        fileMovie.close()


def main(url):
    html = get_page(url)  # the page HTML as a string
    items = get_movie(html)
    write_file(items)


# if __name__ == '__main__':
#     # Walk all ten pages of the board
#     for i in range(0, 100, 10):
#         url = "http://maoyan.com/board/4?offset=" + str(i)
#         print(url)
#         main(url)

if __name__ == '__main__':
    url = "http://maoyan.com/board/4?offset=0"
    main(url)
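The regular expression above is tightly coupled to the page layout. As an alternative, the same fields could be pulled out with BeautifulSoup using the class names visible in the sample HTML (movie-item-info, name, star, releasetime). A sketch under that assumption, not a drop-in replacement:

from bs4 import BeautifulSoup

def get_movie_bs4(html):
    # Parse the board page and collect (name, star, releasetime) tuples,
    # similar to what the regex version returns.
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for info in soup.find_all('div', class_='movie-item-info'):
        name = info.find('p', class_='name').text.strip()
        star = info.find('p', class_='star').text.strip()
        releasetime = info.find('p', class_='releasetime').text.strip()
        items.append((name, star, releasetime))
    return items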
2) Building on the above: several ways to store the scraped movie data
1. Save to a plain text file
2. Save to a NoSQL database (MongoDB) for further operations
3. Save to Excel (the code below actually writes CSV via pandas; see the note after it)
import random
import requests
import re
import pymongo
from pymongo import MongoClient
import pandas as pd

# Pool of User-Agent headers; one is picked at random per request
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]


def get_page(url):
    headers = hds[random.randint(0, len(hds) - 1)]
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        print(e)


'''
Sample of the HTML being parsed:
<div class="board-item-content">
    <div class="movie-item-info">
        <p class="name"><a href="/films/1203" title="霸王别姬" data-act="boarditem-click" data-val="{movieId:1203}">霸王别姬</a></p>
        <p class="star">
            主演:张国荣,张丰毅,巩俐
        </p>
        <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
    </div>
    <div class="movie-item-number score-num">
        <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
    </div>
</div>
'''


def get_movie(html):
    # Extract the movie info with a regex and build a list of dicts
    pattern = '<p.*?><a.*?>(.*?)</a></p>.*?<p.*?>(.*?)</p>.*?<p.*?>(.*?)</p>'
    items = re.findall(pattern, html, re.S)
    # print(items)
    movies_list = []
    for movie in items:
        info = {
            'title': movie[0],
            'starring': movie[1].strip()[3:],  # drop the "主演:" prefix
            'release_date': movie[2][5:]       # drop the "上映时间:" prefix
        }
        movies_list.append(info)
    print(movies_list, 'movies_list')
    return movies_list


# def save_to_mongodb(movie):
#     conn = pymongo.MongoClient('localhost')
#     db = conn['movies']  # use movies
#     if db['movieList'].insert_many(movie):  # insert() is deprecated in newer pymongo
#         print('success...')
#     else:
#         print('error..')


def save_to_csv(movie):
    df = pd.DataFrame(movie)
    df.to_csv('movies.csv')


# def write_file(items):
#     # Note: expects the tuple output of the earlier get_movie version
#     fileMovie = open('movie.txt', 'a+', encoding='utf8')
#     try:
#         for movie in items:
#             fileMovie.write('Title: ' + movie[0] + '\r\n')
#             fileMovie.write('Starring: ' + movie[1].strip() + '\r\n')
#             fileMovie.write('Release date: ' + movie[2] + '\r\n\r\n')
#         print('File written successfully...')
#     finally:
#         fileMovie.close()


def main(url):
    html = get_page(url)  # the page HTML as a string
    print(html)
    items = get_movie(html)
    # 1. Save to a plain text file
    # write_file(items)
    # 2. Save to a NoSQL database
    # save_to_mongodb(items)
    # 3. Save to Excel/CSV
    save_to_csv(items)


if __name__ == '__main__':
    url = "http://maoyan.com/board/4?offset=0"
    print(url)
    main(url)
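Note that to_csv writes a CSV file, not a real Excel workbook. pandas also offers DataFrame.to_excel, which needs the openpyxl package installed; a minimal sketch under that assumption (the save_to_excel name is ours):

import pandas as pd

def save_to_excel(movies_list):
    # movies_list is the list of dicts built by get_movie()
    df = pd.DataFrame(movies_list)
    df.to_excel('movies.xlsx', index=False)  # requires: pip3 install openpyxl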
3) Simulating a GitHub login
import re
import requests
from bs4 import BeautifulSoup

r1 = requests.get('https://github.com/login')

# Two ways to get the authenticity_token (the CSRF token)
soup = BeautifulSoup(r1.text, 'html.parser')
res = soup.find('input', attrs={'name': 'authenticity_token'})
# res2 = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)[0]  # pull the CSRF token straight from the page
print(res.attrs['value'])

r1_cookies = r1.cookies.get_dict()
print(r1_cookies)  # e.g. {'logged_in': 'no', '_gh_sess': 'bHlIxa26', 'has_recent_activity': '1'}

data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": res.attrs['value'],
    "login": "your_username",    # replace with your own GitHub account
    "password": "your_password"  # placeholder; never commit real credentials
}
r2 = requests.post('https://github.com/session', data=data, cookies=r1_cookies)
r2_cookies = r2.cookies.get_dict()

r3 = requests.get('https://github.com/settings/emails', cookies=r2_cookies)
# print(r3.text)

with open("result2.html", "wb") as f:
    f.write(r2.content)  # save the returned page
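A requests.Session carries cookies across requests automatically, so the manual r1_cookies / r2_cookies passing can be dropped. A minimal sketch that mirrors the flow above and reuses the data dict defined there (GitHub's real login form may require additional hidden fields):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
r1 = session.get('https://github.com/login')
token = BeautifulSoup(r1.text, 'html.parser').find(
    'input', attrs={'name': 'authenticity_token'}).attrs['value']

data['authenticity_token'] = token  # reuse the form fields defined above
r2 = session.post('https://github.com/session', data=data)
r3 = session.get('https://github.com/settings/emails')  # cookies travel with the session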