Python 爬虫工具

requests

$ pip install requests

import requests

kw = {'wd':'长城'}

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

formdata = {
    "type":"AUTO",
    "i":"i love python",
    "doctype":"json",
    "xmlVersion":"1.8",
    "keyfrom":"fanyi.web",
    "ue":"UTF-8",
    "action":"FY_BY_ENTER",
    "typoResult":"true"
}

url = "http://www.baidu.com/s?"

# params 接收一个字典或者字符串的查询参数，字典类型自动转换为url编码，不需要urlencode()
resp = requests.get(url, params = kw, headers = headers)
resp = requests.post(url, data = formdata)

# 编码方式
resp.encoding = 'utf-8'

# 查看响应内容，resp.text 返回的是Unicode格式的数据（str）
print(resp.text)

# 查看响应内容，resp.content 返回的是字节流数据（bytes）
print(resp.content)

re 正则表达式

import re

# re.compile 预编译正则表达式
# .*? 非贪婪匹配任意长度字符串（尽可能少地匹配）   (?P<name>.*?) name为分组名，用于提取匹配内容
# re.S 使 . 可以匹配包括换行符在内的任意字符
obj = re.compile(r'<li>.*?<span class="title">(?P<name>.*?)</span>', re.S)

# obj.search 搜索第一个符合正则规则的位置，返回 Match 对象（未匹配到时返回 None）
# group("name") 返回命名分组 name 匹配到的字符串，分组名必须与正则中 (?P<name>...) 的名字一致
result = obj.search(resp.text).group("name")

# 在字符串中找到正则表达式所匹配的所有子串，并把它们作为一个迭代器返回（迭代器中的每个元素是 Match 对象）
re.finditer(pattern, string, flags=0)

xpath 查找HTML标签

$ pip3 install lxml

from lxml import etree

# resp.text 网页文本
et = etree.HTML(resp.text)

# 查找 class 属性等于 'pic' 的<ul>标签，取其直接包含的文本内容（返回列表）
result = et.xpath("//ul[@class='pic']/text()")

# 查找<a>标签 包含的链接
result = et.xpath("//a/@href")

线程池

from concurrent.futures import ThreadPoolExecutor

# 最多 10 线程 
with ThreadPoolExecutor(10) as t:
    t.submit(func, args)

posted @ 2022-02-04 15:08 Raink 阅读(66) 评论(0) 编辑收藏举报

刷新页面返回顶部

Raink

Python 爬虫工具

公告