Python-Based Web Crawling (Part 1)
I. The basic crawler workflow:
# 1. Send a request: use an HTTP library to send a Request to the target site.
#    A Request contains request headers, a request body, etc.
# 2. Get the response content: if the server responds normally, you get back a Response.
#    A Response can contain HTML, JSON, images, video, etc.
# 3. Parse the content:
#    HTML: regular expressions, or a third-party parsing library such as BeautifulSoup or pyquery
#    JSON: the json module
#    Binary data: write it to a file in binary ('b') mode
# 4. Save the data: to a database or to a file
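As a concrete illustration of these four steps, here is a minimal sketch; httpbin.org is an assumed stand-in target, not a site from this post:

import requests                                 # 1. send the request
import json

res = requests.get('http://httpbin.org/get')    # 2. get the response
data = json.loads(res.text)                     # 3. parse the content (JSON here)
with open('result.json', 'w', encoding='utf-8') as f:
    f.write(res.text)                           # 4. save the data to a file
print(data['url'])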
II. Let's crawl xiaohuar.com
import requests
import re

# The three steps of crawling

# Step 1: send the request
def get_page(url):
    index_res = requests.get(url)
    return index_res.text

# Step 2: parse the data
# Parse the index page
def parse_index(index_page):
    detail_urls = re.findall('<div class="items">.*?href="(.*?)"', index_page, re.S)
    # print(detail_urls)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url

# Parse a detail page
def parse_detail(detail_page):
    video_urls = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if video_urls:
        video_url = video_urls[0]
        if video_url.endswith('.mp4'):
            yield video_url
            # print(video_url)

# Step 3: save the data
import uuid
def save_video(video_url):
    try:
        res = requests.get(video_url)
        with open(r'D:\pachong\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
            f.flush()
    except Exception:
        pass

if __name__ == '__main__':
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        index_page = get_page(index_url)
        detail_urls = parse_index(index_page)
        for detail_url in detail_urls:
            detail_page = get_page(detail_url)
            video_urls = parse_detail(detail_page)
            for video_url in video_urls:
                save_video(video_url)
Concurrent version:
# pip3 install requests
import requests
import re
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)

# The three steps of crawling

# Step 1: send the request
def get_page(url):
    print('%s GET start ...' % url)
    index_res = requests.get(url)
    return index_res.text

# Step 2: parse the data
# Parse the index page
def parse_index(index_page):
    # Get the result returned for the index page
    res = index_page.result()
    detail_urls = re.findall('<div class="items">.*?href="(.*?)"', res, re.S)
    # print(detail_urls)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
        # yield detail_url

# Parse a detail page
def parse_detail(detail_page):
    res = detail_page.result()
    video_urls = re.findall('id="media".*?src="(.*?)"', res, re.S)
    if video_urls:
        video_url = video_urls[0]
        if video_url.endswith('.mp4'):
            pool.submit(save_video, video_url)
            # print(video_url)

# Step 3: save the data
import uuid
def save_video(video_url):
    try:
        res = requests.get(video_url)
        with open(r'D:\tank\day01\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
            f.flush()
        print('%s done ...' % video_url)
    except Exception:
        pass

if __name__ == '__main__':
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        pool.submit(get_page, index_url).add_done_callback(parse_index)
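The key mechanism above: pool.submit() returns a concurrent.futures.Future, and the callable handed to add_done_callback receives that Future once the task finishes, so it must call .result() to get the actual return value. A minimal sketch of the pattern in isolation (httpbin.org is an assumed test target, not from the original post):

from concurrent.futures import ThreadPoolExecutor
import requests

pool = ThreadPoolExecutor(2)

def fetch(url):
    # Runs in a worker thread; the return value is stored in the Future
    return requests.get(url).text

def on_done(future):
    # The callback receives the Future itself, not the return value
    text = future.result()
    print('fetched %d bytes' % len(text))

pool.submit(fetch, 'http://httpbin.org/get').add_done_callback(on_done)
pool.shutdown(wait=True)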
III. Basic usage of requests
1. Two ways to send a GET request:
import requests
from urllib.parse import urlencode

# Request URL
base_url = 'https://www.baidu.com/s?' + urlencode({"wd": "美女"})

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Request method: GET
res = requests.get(base_url, headers=headers)
# print(res)          # a Response object
# print(res.text)     # the full HTML text
# print(res.content)  # the binary content

with open('meinv.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
URL-encoding by hand every time is tedious, so you can instead pass the query string as the params argument of GET:
import requests

# Request URL
base_url = 'https://www.baidu.com/s?'

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Request method: GET; requests URL-encodes params for you
res = requests.get(base_url, headers=headers, params={"wd": "黄云"})

with open('小云云.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
Visiting Zhihu with a GET request:
# Visit Zhihu
import requests

# Request URL
zhi_url = 'https://www.zhihu.com/explore'

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Request method: GET
res = requests.get(zhi_url, headers=headers)

with open('知乎.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
Visiting GitHub with a GET request (carrying a logged-in session cookie):
import requests

# Request headers: the logged-in settings page
url = 'https://github.com/settings/emails'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; __Host-user_session_same_site=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; logged_in=yes; dotcom_user=pengsima; _gat=1; tz=Asia%2FShanghai; _gh_sess=U0hueWR2WmcvMEJ3amVCTFpOVm5KUDFob1FQUHBtd1BYK09ENkU0aTBqK1JrYmFiYTd6K3pLb0pSVDV5UzdOU0oxbGluSDR3dmVJYTA3WlVpaHZ2cWJmQTJrVTQzRHVFa1cvT1hrWG1ON1ZMRm1DeEtkQkhDRUVaK2cwUUpRN29UUnlyWnRCODQ3cTRLYWZkcmN5UHdnPT0tLUgxSmxJMUQzWDllblhFT3JMK083Tnc9PQ%3D%3D--92e621b5b1d19cf03e157bf61e02ded6a1a248c6'
}

# Alternative request headers, captured from the email settings page
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; __Host-user_session_same_site=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; logged_in=yes; dotcom_user=pengsima; _gat=1; tz=Asia%2FShanghai; _gh_sess=SE5mdjlBaWtla3B2czNYZFI5UTF6TEhUbERvellXVTZnUVE3d0hjTDBTb3RtZ0UxTXhYSCt4S2h2NXR2c3h2YVNaZUNITHlCOE9GcmhIM2lweVFVellYMExxV3dEK0R1ZU15cUEycmxIRk4yZW1WT2J5c3hFVHZ4Y3ZOaUhBN0ZseWcyTmMwNWxPTEIrMmpnVVpKRUJRPT0tLTdNcFZsOTFidnpxZk05cWVZUmV0MkE9PQ%3D%3D--6064098de4400f5a7ac71cdd3806abd11b2a0134'
}

# Request method: GET
# res = requests.get(url, headers=headers_2)
res = requests.get(url, headers=headers)

with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

print('1059239165' in res.text)  # True
2. POST requests
import requests
import re

# Step 1: GET https://github.com/login to obtain the authenticity token
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
login_res = requests.get('https://github.com/login', headers=headers)
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
print(authenticity_token)

# Step 2: collect the cookies
cookies = {}
# Put the login-page cookies into the cookies dict
cookies.update(login_res.cookies.get_dict())
print(cookies)

# Step 3: POST to the session endpoint
# Request method: POST
# Request URL: https://github.com/session
# Request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "pengsima",
    "password": "oa09116611",
    "webauthn-support": "supported"
}
# To send the body as JSON instead:
# requests.post('https://github.com/session', headers=headers, json=form_data)
res = requests.post('https://github.com/session', headers=headers, data=form_data, cookies=cookies)
# print(res.status_code)

with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
3. The Response object
import requests

baidu = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
res = requests.get(baidu, headers=headers)

# Response status code
print(res.status_code)
print(res)

# Response headers
print(res.headers)

# Response text
print(res.text)

print(res.url)
# print(res.cookies)
print(res.cookies.get_dict())

print(res.encoding)
# res.encoding = 'utf-8'
# print(res.encoding)

print(res.history)
print(res.content)
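One thing worth knowing about res.encoding: when res.text prints as mojibake, the encoding requests inferred from the response headers usually disagrees with the page's real encoding. A small sketch of the usual fix (my addition, not from the original post):

import requests

res = requests.get('https://www.baidu.com/')
# apparent_encoding is detected from the body bytes and is usually more
# reliable than the header-derived default
res.encoding = res.apparent_encoding
print(res.text[:200])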
Downloading an image:
import requests

bo = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551942493340&di=afa19a1f5a3a4fbdec983baaeb1954f0&imgtype=0&src=http%3A%2F%2Fwww.xnnews.com.cn%2Fwenyu%2Flxsj%2F201611%2FW020161114828261827516.jpg'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
res = requests.get(bo, headers=headers, stream=True)

with open('bo2.jpg', 'wb') as f:
    for line in res.iter_content():
        # f.write(res.content)
        f.write(line)
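A note on iter_content(): called with no arguments it yields the body one byte at a time, which is slow for anything sizable. A variant reusing bo and headers from above (the chunk_size is my addition, not in the original):

res = requests.get(bo, headers=headers, stream=True)
with open('bo2.jpg', 'wb') as f:
    for chunk in res.iter_content(chunk_size=1024):  # 1 KB per iteration
        f.write(chunk)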
Extra:
Disabling redirects (allow_redirects defaults to True):
allow_redirects=False
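A quick sketch of the difference; github.com's http-to-https redirect is just an assumed example:

import requests

res = requests.get('http://github.com', allow_redirects=False)
print(res.status_code)               # 301: we see the redirect itself
print(res.headers.get('Location'))   # the target we chose not to follow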
4. Using session:
import requests
import re

session = requests.session()

# Step 1: GET the login page to obtain the token
'''
name="authenticity_token" value="/pE5/yY3Ibm1z0CgiSrqZheBOGQl+rPLs491/TOUL0sRIaQFQzS/s/er5eC/xxEO2AGY0l39b0rEStW/A6Bngg=="
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36',
    # 'Cookies'
}
login_res = session.get('https://github.com/login', headers=headers)
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]

# Step 2: POST to the session endpoint
# Request method: POST
# Request URL: https://github.com/session
# Request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "hdjasbfsas",
    "password": "yy9797910",
}
res = session.post('https://github.com/session', headers=headers, data=form_data)
# print('pengsima' in res.text)
print(res.status_code)
# print(res.text)

# with open('github.html', 'w', encoding='utf-8') as f:
#     f.write(res.text)
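The point of session is that it stores cookies from each response and sends them back on later requests automatically, which is why no manual cookies dict is needed here, unlike the POST example earlier. A minimal demonstration (httpbin.org is an assumed test target, not from the original post):

import requests

session = requests.session()
# The server sets a cookie and redirects...
session.get('http://httpbin.org/cookies/set/token/abc123')
# ...and the session sends it back automatically on the next request
print(session.get('http://httpbin.org/cookies').json())
# {'cookies': {'token': 'abc123'}}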
5. Deserializing JSON:
import requests
import json

res = requests.get('https://www.toutiao.com/stream/widget/local_weather/city/')
print(res.text)

# Two ways to deserialize the JSON
print(json.loads(res.text))
print(res.json())
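Note that res.json() is just a shortcut for json.loads(res.text). If the toutiao endpoint above stops returning JSON, any JSON API will do for testing; httpbin.org here is an assumed substitute, not from the original post:

import requests
import json

res = requests.get('http://httpbin.org/get')
# Both deserialization routes produce the same object
assert res.json() == json.loads(res.text)
print(res.json()['url'])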
Extra:
'''
Advanced requests usage -- good to know!
'''
import requests

# SSL
res = requests.get('https://www.xiaohuar.com/')
print(res.text)

# Improvement 1: ignore the certificate
res = requests.get('https://www.xiaohuar.com/', verify=False)
print(res.text)

# Improvement 2: also suppress the warning
import urllib3
urllib3.disable_warnings()
res = requests.get('https://www.xiaohuar.com/', verify=False)
print(res.text)

# Improvement 3: supply a client certificate
import urllib3
urllib3.disable_warnings()
res = requests.get('https://www.xiaohuar.com/', verify=False, cert=('/path/server.crt', '/path/key'))
print(res.text)

# Using a proxy
res = requests.get('https://www.baidu.com/',
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
                   },
                   # proxies={
                   #     'http': 'http://112.85.130.66:9999',
                   #     # 'https': 'https://112.85.130.66:9999',
                   # }
                   # SOCKS proxies need `pip install requests[socks]`; ip:port left as a placeholder
                   proxies={
                       'http': 'socks5://ip:port',
                       'https': 'socks5://ip:port',
                   })
print(res.text)

# Timeout setting (raises an error when exceeded)
import requests
response = requests.get('https://www.baidu.com', timeout=0.0001)
print(response.text)

# Authentication
import requests
from requests.auth import HTTPBasicAuth
r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
print(r.status_code)

# Uploading files
import requests
files = {'file': open('a.jpg', 'rb')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)