爬虫学习3（主要内容）

# -*- coding: utf-8 -*-

import urllib.request

from bs4 import BeautifulSoup

# Address of the page to download.
baseurl = "http://news.4399.com/seer/jinglingdaquan/"

# urlopen() returns a response object that carries more than the page source:
# it also exposes the request URL, the status code and the headers.
# The `with` block closes the connection as soon as the body has been read.
with urllib.request.urlopen(baseurl) as response:
    # read() returns raw bytes (printed with a b'' prefix), so they must be
    # decoded into text. This page is decoded as GBK; the rule of thumb from
    # the original notes is to try GBK when UTF-8 does not work.
    content = response.read().decode('gbk')

# Print the decoded page source.
print(content)

#结果里边的b是read方法，返回字节形式的二进制数据。

#所以我们要把二进制的数据转化为字符串，叫做解码。decode.

#一个类型和6个方法

import urllib.request

url = "http://news.4399.com/seer/jinglingdaquan/"

# Open the page only to inspect the response object. The `with` block closes
# the connection when it ends, so any read()/readline()/readlines()
# experiments on `response` must be placed inside this block.
with urllib.request.urlopen(url) as response:
    # Prints <class 'http.client.HTTPResponse'>.
    print(type(response))

#它的类型是<class 'http.client.HTTPResponse'>，HTTPresponse类型。

#上面的content是一个字节一个字节的读取。

#如果在content=response.read()加入5，意味着它可以读取五个字节。

#content=response.readline()只能读一行，但是很快

#content=response.readlines()是一行一行的读，到读完。

#可以用返回状态码判断代码有没有问题

#print(response.getcode())

#返回200就是正确。

#print(response.geturl())是返回地址

#print(response.getheaders())返回响应头信息

#爬取的东西下载到本地

#网页，视频，图片

import urllib.request

# Address of the page to save locally.
url_page = 'http://news.4399.com/seer/jinglingdaquan'

# The arguments may be given by keyword (as here) or positionally.
# The page is written to a local file named 4399.html.
urllib.request.urlretrieve(url=url_page, filename='4399.html')

#保存到本地的文件是html类型，后缀很重要！（urlretrieve的返回值是(文件名, 响应头)元组）

#请求对象的定制

import urllib.request

#url的组成

#HTTP/https 有区别 www.baidu.com是主机 s后面的是路径 wd是参数 #是锚点

#协议 主机 端口号 路径 参数 锚点

#端口 http为80，https为443

#当有https的时候爬取数据就会很少，因为有反爬 UA，特殊的字符串头

# Request headers: a browser User-Agent string sent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32',
}

# urlopen() has no headers parameter, so the dictionary cannot be handed to
# it directly. Build a customised Request object carrying both the URL and
# the headers instead.
baseurl = "https://so.lenovo.com.cn/?f=synthesis"

# Request's parameters are ordered (url, data, headers, ...), so headers must
# be passed by keyword: a second positional argument would be taken as data.
request = urllib.request.Request(url=baseurl, headers=headers)

# The `with` block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

print(content)

posted @ 2022-05-04 23:28 吕洞玄阅读(32) 评论(0) 编辑收藏举报

刷新页面返回顶部

爬虫学习3（主要内容）

# -*- coding: utf-8 -*-

import urllib.request

from bs4 import BeautifulSoup

# Address of the page to download.
baseurl = "http://news.4399.com/seer/jinglingdaquan/"

# urlopen() returns a response object that carries more than the page source:
# it also exposes the request URL, the status code and the headers.
# The `with` block closes the connection as soon as the body has been read.
with urllib.request.urlopen(baseurl) as response:
    # read() returns raw bytes (printed with a b'' prefix), so they must be
    # decoded into text. This page is decoded as GBK; the rule of thumb from
    # the original notes is to try GBK when UTF-8 does not work.
    content = response.read().decode('gbk')

# Print the decoded page source.
print(content)

#结果里边的b是read方法，返回字节形式的二进制数据。

#所以我们要把二进制的数据转化为字符串，叫做解码。decode.

#一个类型和6个方法

import urllib.request

url = "http://news.4399.com/seer/jinglingdaquan/"

# Open the page only to inspect the response object. The `with` block closes
# the connection when it ends, so any read()/readline()/readlines()
# experiments on `response` must be placed inside this block.
with urllib.request.urlopen(url) as response:
    # Prints <class 'http.client.HTTPResponse'>.
    print(type(response))

#它的类型是<class 'http.client.HTTPResponse'>，HTTPresponse类型。

#上面的content是一个字节一个字节的读取。

#如果在content=response.read()加入5，意味着它可以读取五个字节。

#content=response.readline()只能读一行，但是很快

#content=response.readlines()是一行一行的读，到读完。

#可以用返回状态码判断代码有没有问题

#print(response.getcode())

#返回200就是正确。

#print(response.geturl())是返回地址

#print(response.getheaders())返回响应头信息

#爬取的东西下载到本地

#网页，视频，图片

import urllib.request

# Address of the page to save locally.
url_page = 'http://news.4399.com/seer/jinglingdaquan'

# The arguments may be given by keyword (as here) or positionally.
# The page is written to a local file named 4399.html.
urllib.request.urlretrieve(url=url_page, filename='4399.html')

#保存到本地的文件是html类型，后缀很重要！（urlretrieve的返回值是(文件名, 响应头)元组）

#请求对象的定制

import urllib.request

#url的组成

#HTTP/https 有区别 www.baidu.com是主机 s后面的是路径 wd是参数 #是锚点

#协议 主机 端口号 路径 参数 锚点

#端口 http为80，https为443

#当有https的时候爬取数据就会很少，因为有反爬 UA，特殊的字符串头

# Request headers: a browser User-Agent string sent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32',
}

# urlopen() has no headers parameter, so the dictionary cannot be handed to
# it directly. Build a customised Request object carrying both the URL and
# the headers instead.
baseurl = "https://so.lenovo.com.cn/?f=synthesis"

# Request's parameters are ordered (url, data, headers, ...), so headers must
# be passed by keyword: a second positional argument would be taken as data.
request = urllib.request.Request(url=baseurl, headers=headers)

# The `with` block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

print(content)

公告

#协议 主机 端口号 路径 参数 锚点