Python 15 爬虫(一)
https://www.cnblogs.com/wupeiqi/articles/6283017.html
一、requests模块
# Cheat-sheet: the main keyword arguments accepted by requests.post().
# NOTE: the original notes wrote ``header=``; the actual parameter name
# in requests is ``headers=``.
requests.post(
    url="xxx",
    headers={"xxx": "xxx"},
    cookies={},
    params={},   # query-string parameters appended to the URL
    # ``data`` and ``json`` both populate the request body, but ``json``
    # automatically serializes the payload to JSON.
    data={},
    json={},
    # ###### proxies ######
    proxies={
        # route requests through a proxy, chosen per scheme
        "http": "xxxxx",
        "https": "xxxxx",
        # or map one specific site to another proxy
        # (original notes were missing the comma after this entry)
        "https://xxxxx": "xxxxx",
    },
    auth=HTTPProxyAuth("用户名", "密码"),  # proxy authentication
    # ###### file upload ######
    files={"f1": open("xxx", "rb")},
    # ###### timeout ######
    # A single number is the connect timeout only; a 2-tuple is
    # (connect timeout, read timeout).  A keyword may appear only once,
    # so the tuple form is shown here:
    timeout=(5, 1),
    # ###### redirects ######
    allow_redirects=False,
    # ###### large file download ######
    stream=True,  # fetch the body piece by piece instead of all at once
)
二、BeautifulSoup模块
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
<div class="title">
<b>The Dormouse's story总共</b>
<h1>f</h1>
</div>
<div class="story">Once upon a time there were three little sisters; and their names were
<a class="sister0" id="link1">Els<span>f</span>ie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")

# Locate the first <a> tag in document order.
tag1 = soup.find(name='a')

# Collect every <a> tag in the document.
tag2 = soup.find_all(name='a')

# CSS-selector lookup: the element whose id is "link2".
tag3 = soup.select('#link2')
三、长轮询
实现机制:利用队列,如果队列为空则夯住,超出时间则再次请求
创建一个存放队列的字典,每一个请求到来时,在字典中创建这个用户的键值对,值就是属于他的队列,如果有人投票,则循环整个字典,给每个队列都添加新的数据。每隔五秒发一次请求,如果没人投票则一直等待,直到五秒后再次发送请求。
import queue
import threading

from flask import Blueprint, render_template, request, jsonify, session

t = Blueprint("t", __name__)

# NOTE(review): this is evaluated ONCE at import time, so every request
# (and every browser) shares the same dictionary key — each client does
# NOT actually get its own queue, contrary to the surrounding notes.
# A real implementation should key ``queue_dic`` on a per-session id;
# confirm intent before changing behavior.
uuid = threading.get_ident()

# candidate id -> {"name": candidate name, "count": current vote tally}
vote_count = {
    1: {"name": "李静", "count": 1},
    2: {"name": "胡岳枫", "count": 2}
}

queue_dic = {}  # one pending-notification queue per registered client


@t.route("/user_info", methods=["GET", "POST"])
def user_info():
    """Register a notification queue for this client and render the vote page."""
    queue_dic[uuid] = queue.Queue()
    return render_template("user_info.html", vote_count=vote_count)


@t.route("/vote", methods=["GET", "POST"])
def vote():
    """Record one vote, then broadcast the new tally to every waiting client."""
    user_id = int(request.form.get("user_id"))
    vote_count[user_id]["count"] += 1
    # Push the updated tally into every client's queue so each pending
    # long-poll request wakes up immediately.
    for v in queue_dic.values():
        v.put(vote_count)
    return "success"


@t.route("/get_vote")
def get_vote():
    """Long-poll endpoint: block up to 5 s waiting for a tally update.

    Returns ``{"status": False}`` when the wait times out (or this client
    has no registered queue), so the browser simply re-issues the request.
    """
    ret = {"status": True, "data": None}
    try:
        val = queue_dic[uuid].get(timeout=5)
        ret["data"] = val
    # Was a bare ``except:`` — catch only the two expected failures
    # (no queue registered yet, or the 5 s wait expired) instead of
    # silently swallowing every error.
    except (KeyError, queue.Empty):
        ret["status"] = False
    return jsonify(ret)
一、requests模块
1.基本使用
import requests

# Basic GET request: query-string parameters plus a custom User-Agent header.
target = 'xxxx'
query = {'xxx': 'xxx'}
request_headers = {'User-Agent': 'xxx'}

response = requests.get(url=target, params=query, headers=request_headers)

response.text  # the decoded response body as plain-text HTML