模拟登录并爬取GitHub
因为崔前辈给出的代码运行有误,这里对其略作了修改和简化。
书上例题,不做介绍。
import requests
from lxml import etree


class Login(object):
    """Sign in to GitHub through its HTML login form using one shared session.

    The flow is: GET the login page, read the hidden ``authenticity_token``
    form field from it, then POST that token together with the credentials
    to the session endpoint.
    """

    def __init__(self, timeout=10):
        """Prepare request headers, endpoint URLs and the HTTP session.

        Args:
            timeout: Seconds to wait on each HTTP request before ``requests``
                raises a timeout error. Without it a stalled connection
                would block forever.
        """
        self.headers = {
            'Referer': 'https://github.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Host': 'github.com'
        }
        # Page that serves the sign-in form.
        self.login_url = 'https://github.com/login'
        # Endpoint the sign-in form is POSTed to.
        self.post_url = 'https://github.com/session'
        self.timeout = timeout
        # A Session keeps state and handles cookies automatically, so later
        # requests made through it stay logged in.
        self.session = requests.Session()

    def token(self):
        """Fetch the login page and extract its ``authenticity_token`` values.

        Returns:
            list: The ``value`` attribute of every
            ``<input name="authenticity_token">`` element on the page, in
            document order. Empty when the page contains none.
        """
        response = self.session.get(
            self.login_url, headers=self.headers, timeout=self.timeout
        )
        selector = etree.HTML(response.text)
        # Guard in case the parser produced no tree (e.g. an empty body).
        if selector is None:
            return []
        return selector.xpath('//input[@name="authenticity_token"]/@value')

    def login(self, email, password, output_path='D:/github.txt'):
        """Submit the login form, then print and save the returned page.

        Args:
            email: Value sent as the form's ``login`` field.
            password: Value sent as the form's ``password`` field.
            output_path: File the response HTML is written to (UTF-8).
                Defaults to the original hard-coded location.

        Prints ``Error!!!`` when no token can be found on the login page or
        when the POST does not answer with HTTP 200.

        NOTE: only the status code is checked; the response body is not
        inspected to confirm that the credentials were accepted.
        """
        tokens = self.token()
        if not tokens:
            # Without a token the form submission cannot succeed, so skip
            # the POST and report the failure the same way as a bad status.
            print("Error!!!")
            return

        post_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            # Send exactly one token value rather than the whole XPath
            # result list, which could hold several entries.
            'authenticity_token': tokens[0],
            'login': email,
            'password': password
        }
        # Simulate the browser's form submission.
        response = self.session.post(
            self.post_url,
            data=post_data,
            headers=self.headers,
            timeout=self.timeout,
        )
        if response.status_code != 200:
            print("Error!!!")
            return

        # Request succeeded: show the returned page and keep a copy on disk.
        print(response.text)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(response.text)


if __name__ == "__main__":
    login = Login()
    # Replace with your own account and password.
    login.login(email='1024593536@qq.com', password='password')
保存的文件可以改成网页形式(例如把扩展名改为 .html),用浏览器打开查看。