Python Web Scraping Notes (Part 1)

The first crawler:

from urllib import request

# Send a GET request and read the response body as text
response = request.urlopen('http://www.baidu.com')
html = response.read().decode()
print(html)
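
The object returned by urlopen carries more than just the body. A minimal sketch of inspecting it before decoding, using the standard HTTPResponse methods:

from urllib import request

response = request.urlopen('http://www.baidu.com')
print(response.getcode())     # HTTP status code, e.g. 200
print(response.geturl())      # final URL after any redirects
print(response.getheaders())  # list of (header, value) tuples
html = response.read().decode()
print(len(html))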
    

Faking a browser (custom User-Agent):


from urllib import request
import re

url = r"http://www.baidu.com/"
# Pretend to be a mobile browser by sending a custom User-Agent header
header = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16A366 MicroMessenger/6.7.3(0x16070321) NetType/WIFI Language/zh_CN"}
req = request.Request(url, headers=header)
response = request.urlopen(req).read().decode()
# Extract the page title with a regular expression
pat = r"<title>(.*?)</title>"
data = re.findall(pat, response)
print(data)
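
Calling .decode() with no argument assumes UTF-8. A small sketch, reusing the header above, that decodes with whatever charset the server declares in its Content-Type header (falling back to utf-8 if none is declared):

from urllib import request

req = request.Request("http://www.baidu.com/", headers=header)  # header as defined above
response = request.urlopen(req)
charset = response.info().get_content_charset() or "utf-8"  # declared charset, if any
html = response.read().decode(charset)
print(charset, len(html))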

 

Setting a proxy:

from urllib import request
import random

# Pool of proxy servers (all the same address here); a random one is picked per run
proxylist = [{"http": "120.83.109.103:9999"},
             {"http": "120.83.109.103:9999"},
             {"http": "120.83.109.103:9999"},
             {"http": "120.83.109.103:9999"},
             ]
proxy = random.choice(proxylist)
print(proxy)
# Build a proxy handler object
proxyHandler = request.ProxyHandler(proxy)

# Build an opener that routes requests through the proxy
opener = request.build_opener(proxyHandler)
req = request.Request("http://www.baidu.com")
res = opener.open(req)
print(res.read().decode())
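
If every request in the script should go through the proxy, the opener can also be installed globally; after that plain request.urlopen() uses it. A minimal sketch, reusing the proxyHandler built above:

from urllib import request

opener = request.build_opener(proxyHandler)
request.install_opener(opener)                  # make this opener the default
res = request.urlopen("http://www.baidu.com")   # now routed through the proxy
print(res.getcode())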

Scraping Baidu Tieba pages:

from urllib import request
import urllib
import time

# Send a request and return the raw server response
def loadPage(fullurl, filename):
    print("Downloading:", filename)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    req = request.Request(fullurl, headers=header)
    res = request.urlopen(req).read()
    return res


# Write the downloaded page to a local file
def writePage(html, filename):
    print("Saving:", filename)
    with open(filename, "wb") as f:
        f.write(html)
        print(".....")


# Build the URL for each page and download it
def tiebaSpider(url, begin, end):
    for page in range(begin, end + 1):
        pn = (page - 1) * 50                 # Tieba pages step by 50 posts
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        filename = "C:/page_" + str(page) + ".html"
        html = loadPage(fullurl, filename)   # fetch the page
        writePage(html, filename)            # save it to disk


if __name__ == '__main__':
    kw = input("Enter the Tieba forum name: ")
    begin = int(input("Enter the start page: "))
    end = int(input("Enter the end page: "))
    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    url = url + key
    tiebaSpider(url, begin, end)
    time.sleep(6)
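
The single time.sleep(6) at the end only delays program exit; to be polite to the server, the pause belongs between page downloads. A hedged sketch of the same loop with a per-page delay (tiebaSpiderPolite is just an illustrative name, and the 2-second interval is an arbitrary choice):

import time

def tiebaSpiderPolite(url, begin, end, delay=2):
    # Same paging logic as tiebaSpider above, but sleep between requests
    for page in range(begin, end + 1):
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        filename = "C:/page_" + str(page) + ".html"
        writePage(loadPage(fullurl, filename), filename)
        if page < end:
            time.sleep(delay)   # pause before fetching the next page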

GET request:

from urllib import request
import urllib

# Build an opener around the default HTTP handler
http_handler = request.HTTPHandler()
opener = request.build_opener(http_handler)

# urlencode percent-encodes the query parameters (including non-ASCII text)
wd = {"wd": "北京"}
wdd = urllib.parse.urlencode(wd)
print(wdd)

url = "http://www.baidu.com/s?"
url = url + wdd
print(url)

req = request.Request(url)
print(req)
res = opener.open(req)
print(res.read().decode())
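
urlencode also handles several parameters at once, and urllib.parse.quote is the tool for encoding a single value that goes into a URL path rather than a query string. A short sketch (the parameter names are just examples):

import urllib.parse

params = {"wd": "北京", "pn": 10, "ie": "utf-8"}   # example parameter names
print(urllib.parse.urlencode(params))              # wd=%E5%8C%97%E4%BA%AC&pn=10&ie=utf-8
print(urllib.parse.quote("北京"))                   # %E5%8C%97%E4%BA%AC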

 

POST request:

from urllib import request
import urllib
import re

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
# The browser posts to translate_o, but the _o must be removed or the request is rejected
url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
url1 = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
key = "中国"
# Form fields copied from the browser's POST request
formdata = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "15651639810027",
    "sign": "7688b75dad2fed75aea5924b1f8ee127",
    "ts": "1565163981002",
    "bv": "62188471f020213764ab67d1893204f7",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME"
}
# POST data must be a bytes object
data = urllib.parse.urlencode(formdata).encode("utf-8")
req = request.Request(url=url1, data=data, headers=header)
resp = request.urlopen(req).read().decode()
# Pull the translated text out of the JSON response
pat = r'"tgt":"(.*?)"}]]}'
result = re.findall(pat, resp)
print(result[0])
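
Since the form asks for doctype=json, the response can also be parsed with the json module instead of a regular expression. A sketch, assuming the response keeps the translateResult layout that the regex above relies on:

import json

obj = json.loads(resp)                        # resp from the request above
result = obj["translateResult"][0][0]["tgt"]  # assumed JSON layout
print(result)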

Exception handling:

from urllib import request

# The first URL has no scheme and the third host does not exist; both should fail
list1 = ["www.baidu.com",
         "http://www.baidu.com",
         "http://www.baidu.cm0",
         "http://www.baidu.com",
         "http://www.baidu.com"]

i = 0
for url in list1:
    i = i + 1
    try:
        request.urlopen(url)
    except Exception as e:
        print(e)
    print("Request", i, "finished")

 
