# Python 爬虫 (web crawler) examples

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Download the demo page. The context manager closes the HTTP response as
# soon as the body has been read instead of leaving the connection open.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
# print(html)

# Parse the markup with the lxml backend.
soup = BeautifulSoup(html, features='lxml')

# Attribute access on the soup returns the first tag with that name.
print(soup.h1)
# -> <h1>...</h1>
print(soup.p)
# -> <p>...</p>

# Collect the target of every link on the page.
# href=True limits the match to anchors that actually carry an href
# attribute, so an <a> without one cannot raise KeyError in the lookup.
all_href = [anchor['href'] for anchor in soup.find_all('a', href=True)]
print('\n', all_href)

# Select elements by their CSS class.
month = soup.find_all('li', attrs={"class": "month"})
for item in month:
    # The whole tag, e.g. <li class="month">XXX</li>
    print(item)
    # Only the text inside the tag, e.g. XXX
    print(item.get_text())

# Use a regular expression to pick out the JPEG images.
# The pattern is a raw string so that "\." is a regex escape rather than an
# invalid string escape, and it is anchored with "$" because BeautifulSoup
# applies the pattern with search(), which would otherwise accept ".jpg"
# anywhere inside the URL instead of only at its end.
jpg_pattern = re.compile(r'\.jpg$')
img_links = soup.find_all("img", {"src": jpg_pattern})

print(img_links)
# e.g. [<img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg"/>]

for link in img_links:
    print(link['src'])
# e.g. https://morvanzhou.github.io/static/img/course_cover/tf.jpg

# Use a regular expression to pick out links by their href.
morvan_href = re.compile('https://morvan.*')
course_links = soup.find_all('a', attrs={'href': morvan_href})
print(course_links)
# e.g. [<a href="https://morvanzhou.github.io/">...</a>]

for anchor in course_links:
    print(anchor['href'])
# e.g. https://morvanzhou.github.io/
import requests
import webbrowser

# GET request: requests URL-encodes the query parameters.
param = {"wd": "莫烦Python"}  # search keywords
# requests never times out by default, so set an explicit limit to avoid
# hanging forever on an unresponsive server.
r = requests.get('http://www.baidu.com/s', params=param, timeout=10)
print(r.url)
# http://www.baidu.com/s?wd=%E8%8E%AB%E7%83%A6Python
# Show the resulting search page in the default browser.
webbrowser.open(r.url)

# POST request: submit form fields.
# 'lastname' was empty, which contradicts the documented response below;
# restore the value so the request matches the expected output.
data = {'firstname': '莫烦', 'lastname': '周'}  # form data to submit
# Explicit timeout: requests waits indefinitely by default.
r = requests.post(
    'http://pythonscraping.com/files/processing.php', data=data, timeout=10)
print(r.text)
# Hello there, 莫烦 周!

# Upload an image through the `files` argument.
# The context manager guarantees the file handle is closed once the upload
# finishes, even if the request raises.
with open('./image.png', 'rb') as image:
    file = {'uploadFile': image}
    # Explicit timeout: requests waits indefinitely by default.
    r = requests.post(
        'http://pythonscraping.com/files/processing2.php', files=file,
        timeout=10)
print(r.text)
# The file image.png has been uploaded.

# Log in with a session: the Session object carries the cookies set by the
# login response over to the following request.
# The context manager closes the session when the block ends.
with requests.Session() as session:
    payload = {'username': 'Morvan', 'password': 'password'}
    # Explicit timeouts: requests waits indefinitely by default.
    r = session.post(
        'http://pythonscraping.com/pages/cookies/welcome.php', data=payload,
        timeout=10)
    print(r.cookies.get_dict())
    # {'username': 'Morvan', 'loggedin': '1'}

    # No credentials are passed here; the session sends the stored cookies.
    r = session.get(
        "http://pythonscraping.com/pages/cookies/profile.php", timeout=10)
    print(r.text)
    # Hey Morvan! Looks like you're still logged into the site!
# posted @ 2019-12-29 16:55  Junzhao  阅读(178)  评论(0)  编辑  收藏  举报