# 爬虫第一天

import urllib.request
import urllib.parse

# url = 'http://www.baidu.com'
# response = urllib.request.urlopen(url=url)
# with open('baidu.html', 'w', encoding='utf-8')as fp:
# print(response.read().decode())
# # 读取到的响应内容是字节类型,需要用.decode()转换成字符串
# print(response.geturl())
# # 获取请求的url
# print(dict(response.getheaders()))
# # 获取头部信息,返回的是由元组组成的列表
# print(response.getcode())
# #获取状态码
# print(response.readlines())
# # 按行读取响应内容,返回由字节串组成的列表
# fp.write(response.read().decode())
#


# #下载图片:图片只能以二进制格式写入本地文件
# img_url='http://'
# response = urllib.request.urlopen(img_url)
# with open('qing.jpg','wb')as fp:
#     fp.write(response.read())
# # 直接把图片写进文件里
# urllib.request.urlretrieve(img_url,'chun.jpg')
# url = 'http://www.baidu.com/index.html?name=狗蛋&pwd=123456'
# ret=urllib.parse.quote(url)#编码函数(默认只保留'/',URL中的':'、'?'、'='、'&'也会被编码,一般只对参数值使用)
# re = urllib.parse.unquote(ret)#解码函数
# print(ret)
#
# name='good'
# age=18
# sex='sex'
# data = {
# 'name':name,
# 'age':age,
# 'sex':sex
# }
#给一个字典,将字典拼接为query_string,并且实现了编码的功能
# query_string = urllib.parse.urlencode(data)
# print(query_string)
# #遍历字典
# It=[]
# for k,v in data.items():
#     It.append(k + '=' + str(v))
# query_string = '&'.join(It)
# url = url + '?'+query_string




# get方式
# word = input('请输入您想要搜索的内容:')
# url='http://www.baidu.com/s?'#不能用https访问
# data ={
# 'ie':'utf-8',
# 'f':'8',
# 'rsv_bp':'1',
# 'rsv_idx':'1',
# 'tn':'baidu',
# 'wd': word,
# }
# s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=美国
# s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=中国
# query_string = urllib.parse.urlencode(data)
# url+=query_string
# response = urllib.request.urlopen(url)
# filename=word + '.html'
# print(filename)
# with open(filename,'wb')as fp:
#     fp.write(response.read())



# 构建请求头部信息,(这是反爬第一步)
# 伪装自己的UA,让服务器以为你是浏览器在上网
# url='http://www.baidu.com'
# response = urllib.request.urlopen(url)
# print(response.read().decode())
# # 自己要伪装的头部
# headers={
# 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;'
# ' 360SE)',
# }
# #构建请求对象
# request = urllib.request.Request(url=url,headers=headers)
# response = urllib.request.urlopen(request)
# print(response.read())
# posted @ 2019-04-03 15:36  呼呼嘻嘻  阅读(114)  评论(0)  编辑  收藏  举报