Python web scraping notes (1)
The first crawler:
from urllib import request

# Send a GET request and print the raw page source
response = request.urlopen('http://www.baidu.com')
html = response.read().decode()
print(html)
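Besides the body, the response object returned by urlopen also carries the status code, headers, and final URL, which are handy for checking that the request actually succeeded. A minimal sketch using the standard response methods:
from urllib import request

response = request.urlopen('http://www.baidu.com')
print(response.getcode())   # HTTP status code, e.g. 200
print(response.info())      # response headers
print(response.geturl())    # final URL after any redirects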
Faking a browser (custom User-Agent):
from urllib import request
import re
url=r"http://www.baidu.com/"
header={"User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16A366 MicroMessenger/6.7.3(0x16070321) NetType/WIFI Language/zh_CN"}
req=request.Request(url,headers=header)
response=request.urlopen(req).read().decode()  # send the request with the custom header and decode the HTML
pat=r"<title>(.*?)</title>"  # regex to pull out the page title
data=re.findall(pat,response)
print(data)
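The header can also be attached after the Request object has been created, via add_header. A quick variant of the example above (any browser User-Agent string will do; the one below is just a placeholder):
from urllib import request

req = request.Request("http://www.baidu.com/")
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")  # placeholder UA string
html = request.urlopen(req).read().decode()
print(html[:200])   # print only the beginning of the page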
Setting a proxy:
from urllib import request
import random

# Pool of proxies; a random one is picked for each run
proxylist=[{"http":"120.83.109.103:9999"},
           {"http":"120.83.109.103:9999"},
           {"http":"120.83.109.103:9999"},
           {"http":"120.83.109.103:9999"},
           ]
proxy=random.choice(proxylist)
print(proxy)
# Build the proxy handler object
proxyHandler=request.ProxyHandler(proxy)
# Build an opener that routes requests through the proxy
opener=request.build_opener(proxyHandler)
req=request.Request("http://www.baidu.com")
res=opener.open(req)
print(res.read().decode())
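If every later request should go through the proxy, the opener can be installed globally so that plain urlopen calls use it too. A small sketch, reusing the same sample proxy address as above:
from urllib import request

proxy_handler = request.ProxyHandler({"http": "120.83.109.103:9999"})  # same sample proxy as above
opener = request.build_opener(proxy_handler)
request.install_opener(opener)   # from now on request.urlopen() goes through the proxy

res = request.urlopen("http://www.baidu.com")
print(res.getcode())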
Crawling Tieba (Baidu forum) pages:
from urllib import request
import urllib
import time

# Send the request and return the raw server response
def loadPage(fullurl,filename):
    print("Downloading:",filename)
    header={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    req=request.Request(fullurl,headers=header)
    res=request.urlopen(req).read()
    return res

# Write the downloaded HTML to a local file
def writePage(html,filename):
    print("Saving:",filename)
    with open(filename,"wb") as f:
        f.write(html)
    print(".....")

# Build the URL for every page and crawl it
def tiebaSpider(url,begin,end):
    for page in range(begin,end+1):
        pn=(page-1)*50                      # each Tieba listing page holds 50 posts
        fullurl=url+"&pn="+str(pn)
        print(fullurl)
        filename="C:/page_"+str(page)+".html"
        html=loadPage(fullurl,filename)     # fetch the page
        writePage(html,filename)            # save it locally

if __name__ == '__main__':
    kw=input("Enter the Tieba name: ")
    begin=int(input("Enter the start page: "))
    end=int(input("Enter the end page: "))
    url="http://tieba.baidu.com/f?"
    key=urllib.parse.urlencode({"kw":kw})
    url=url+key
    tiebaSpider(url,begin,end)
    time.sleep(6)
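When crawling many pages it helps to pause between requests and to catch failures per page, so one bad page does not abort the whole run. A hypothetical hardened variant of the loop above (loadPage and writePage are the helpers defined in the previous block):
import time
from urllib import error

def tiebaSpiderSafe(url, begin, end):
    for page in range(begin, end + 1):
        fullurl = url + "&pn=" + str((page - 1) * 50)
        filename = "C:/page_" + str(page) + ".html"
        try:
            html = loadPage(fullurl, filename)   # reuse the downloader defined above
            writePage(html, filename)
        except error.URLError as e:              # skip pages that fail instead of crashing
            print("Failed:", fullurl, e)
        time.sleep(2)                            # be polite: wait between requests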
GET request:
from urllib import request
import urllib
http_handler=request.HTTPHandler()
opener=request.build_opener(http_handler)
wd={"wd":"北京"}
wdd=urllib.parse.urlencode(wd)   # URL-encode the query string, e.g. wd=%E5%8C%97%E4%BA%AC
print(wdd)
url="http://www.baidu.com/s?"
url=url+wdd
print(url)
req=request.Request(url)
print(req)
res=opener.open(req)
print(res.read().decode())
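urlencode works on a whole dict of parameters; for a single value, urllib.parse.quote does the same percent-encoding, and unquote reverses it. A small comparison:
import urllib.parse

print(urllib.parse.urlencode({"wd": "北京"}))      # 'wd=%E5%8C%97%E4%BA%AC'
print(urllib.parse.quote("北京"))                  # '%E5%8C%97%E4%BA%AC'
print(urllib.parse.unquote("%E5%8C%97%E4%BA%AC"))  # back to '北京'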
POST request:
from urllib import request
import urllib
import re

header={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
url="http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"   # the _o endpoint rejects the request, so the _o has to be dropped
url1="http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
key="中国"
formdata={
    "i":key,
    "from":"AUTO",
    "to":"AUTO",
    "smartresult":"dict",
    "client":"fanyideskweb",
    "salt":"15651639810027",
    "sign":"7688b75dad2fed75aea5924b1f8ee127",
    "ts":"1565163981002",
    "bv":"62188471f020213764ab67d1893204f7",
    "doctype":"json",
    "version":"2.1",
    "keyfrom":"fanyi.web",
    "action":"FY_BY_REALTlME"
}
data=urllib.parse.urlencode(formdata).encode("utf-8")   # POST data must be bytes
req=request.Request(url=url1,data=data,headers=header)
resp=request.urlopen(req).read().decode()
pat=r'"tgt":"(.*?)"}]]}'   # pull the translation out of the JSON response
result=re.findall(pat,resp)
print(result[0])
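Since doctype is set to json, the reply can also be parsed with the json module instead of a regex. A sketch, assuming the response has the nested translateResult[[{"tgt": ...}]] shape that the regex above relies on:
import json

resp_json = json.loads(resp)    # resp is the decoded response from the block above
result = resp_json["translateResult"][0][0]["tgt"]
print(result)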
Exception handling:
from urllib import request

list1=["www.baidu.com",          # missing scheme
       "http://www.baidu.com",
       "http://www.baidu.cm0",   # bad domain
       "http://www.baidu.com",
       "http://www.baidu.com"]
i=0
for url in list1:
    i=i+1
    try:
        request.urlopen(url)
    except Exception as e:
        print(e)
    print("Request",i,"finished")
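Catching Exception works, but urllib distinguishes between HTTPError (the server answered with an error status) and URLError (the request never got a proper answer); since HTTPError is a subclass of URLError, it has to be caught first. A minimal sketch of handling them separately, using a made-up path just to trigger an error:
from urllib import request, error

try:
    request.urlopen("http://www.baidu.com/some-missing-page")
except error.HTTPError as e:    # server responded, but with an error status
    print("HTTP error:", e.code)
except error.URLError as e:     # network problem, bad host, refused connection...
    print("URL error:", e.reason)
else:
    print("Request succeeded")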