from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
html = urlopen("https://morvanzhou.github.io/static/scraping/table.html").read().decode('utf-8')
# print(html)
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)
# <h1>标题</h1>  ("标题" = "heading"; the demo page content is in Chinese)
print(soup.p)
# <p>段落</p>  ("段落" = "paragraph")
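# Attribute access such as soup.h1 is shorthand for soup.find('h1') and
# returns only the first matching tag (a note, not in the original tutorial):
print(soup.find('h1'))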
# Scrape all links on the page
all_links = soup.find_all('a')
all_href = [a['href'] for a in all_links]
print('\n', all_href)
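# Note: a['href'] raises KeyError when an <a> tag has no href attribute.
# A defensive variant (a sketch, not from the original tutorial) uses
# Tag.get(), which returns None for missing attributes:
safe_href = [a.get('href') for a in all_links if a.get('href')]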
# Scrape elements by their CSS class
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m)
    # <li class="month">XXX</li>
    print(m.get_text())
    # XXX
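# The same lookup via BeautifulSoup's CSS-selector API (a sketch, equivalent
# to the find_all call above, not part of the original tutorial):
for m in soup.select('li.month'):
    print(m.get_text())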
# Use a regular expression to narrow the match: scrape image tags
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})  # any leading characters followed by ".jpg"
print(img_links)
# [<img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg"/>]
for link in img_links:
    print(link['src'])
    # https://morvanzhou.github.io/static/img/course_cover/tf.jpg
# Use a regular expression to narrow the match: scrape specific links
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
print(course_links)
# [<a href="https://morvanzhou.github.io/">莫烦 Python</a>]
for link in course_links:
    print(link['href'])
    # https://morvanzhou.github.io/
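# find_all also accepts a plain function as the filter (a sketch, not part of
# the original tutorial): here, every <a> tag that defines an href attribute.
links_with_href = soup.find_all(lambda tag: tag.name == 'a' and tag.has_attr('href'))
print(len(links_with_href))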
import requests
import webbrowser
# GET request: query parameters are encoded into the URL
param = {"wd": "莫烦Python"}  # the search query
r = requests.get('http://www.baidu.com/s', params=param)
print(r.url)
# http://www.baidu.com/s?wd=%E8%8E%AB%E7%83%A6Python
webbrowser.open(r.url)
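# A quick sanity check (a sketch, not in the original tutorial): requests
# exposes the HTTP status code, and raise_for_status() raises on 4xx/5xx.
print(r.status_code)  # 200 on success
r.raise_for_status()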
# POST request: form fields are sent in the request body
data = {'firstname': '莫烦', 'lastname': '周'}  # the form data to submit
r = requests.post('http://pythonscraping.com/files/processing.php', data=data)
print(r.text)
# Hello there, 莫烦 周!
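# requests can also serialize a dict to JSON with the json= keyword
# (a sketch with a hypothetical endpoint; the form above expects URL-encoded data):
# r = requests.post('http://example.com/api', json={'firstname': '莫烦'})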
# Upload a file (here an image)
file = {'uploadFile': open('./image.png', 'rb')}
r = requests.post(
'http://pythonscraping.com/files/processing2.php', files=file)
print(r.text)
# The file image.png has been uploaded.
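# The open() call above never closes the file handle. A tidier variant
# (a sketch, same endpoint as above) uses a context manager so the file
# is closed once the request has been sent:
with open('./image.png', 'rb') as f:
    r = requests.post(
        'http://pythonscraping.com/files/processing2.php',
        files={'uploadFile': f})
print(r.text)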
# Sessions: log in once and reuse cookies across requests
session = requests.Session()
payload = {'username': 'Morvan', 'password': 'password'}
r = session.post(
'http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())
# {'username': 'Morvan', 'loggedin': '1'}
r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(r.text)
# Hey Morvan! Looks like you're still logged into the site!
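# The Session object stores the login cookies and attaches them to every
# later request automatically, which is why profile.php still sees us as
# logged in. Inspecting the session's cookie jar (a note, not in the
# original tutorial) shows the same cookies printed above:
print(session.cookies.get_dict())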