pip3 install requests
pip3 install beautifulsoup4
import os

import requests
from bs4 import BeautifulSoup

# NOTE(review): the page URL is blank in this copy of the script; fill it in
# before running, because requests rejects an empty URL.
PAGE_URL = ""
RESULT_FILE = 'res.txt'
IMAGE_DIR = './img/'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever


def _summary_text(paragraph):
    """Return the paragraph text after its first space, or all of it if it has no space."""
    _, separator, remainder = paragraph.text.partition(' ')
    return remainder if separator else paragraph.text


def _image_filename(src):
    """Return the part of ``src`` after its last '__', falling back to the basename."""
    pieces = src.rsplit('__', maxsplit=1)
    if len(pieces) == 2 and pieces[1]:
        return pieces[1]
    return os.path.basename(src)


def _save_image(src):
    """Download the image at ``src`` into IMAGE_DIR; report and skip it on failure."""
    filename = _image_filename(src)
    if not filename:
        return
    # The original always prefixed 'https:', i.e. it expected scheme-less
    # ("//host/path") addresses; only add the scheme when it is really missing.
    image_url = 'https:' + src if src.startswith('//') else src
    try:
        down_img = requests.get(url=image_url, timeout=REQUEST_TIMEOUT)
        down_img.raise_for_status()
    except requests.RequestException as exc:
        # One broken image should not abort the whole crawl.
        print('skip image', image_url, exc)
        return
    with open(os.path.join(IMAGE_DIR, filename), 'wb') as f:
        f.write(down_img.content)


def main():
    """Fetch the article list, write title/link/summary to RESULT_FILE, save each image."""
    # 1. Download the page.
    ret = requests.get(url=PAGE_URL, timeout=REQUEST_TIMEOUT)
    ret.raise_for_status()
    # Decode with the encoding detected from the page content itself.
    ret.encoding = ret.apparent_encoding

    # 2. Parse the page and pick out the wanted content with BeautifulSoup.
    #    ('lxml' is a faster parser if it is installed.)
    soup = BeautifulSoup(ret.text, 'html.parser')
    # To filter on class with a keyword argument bs4 spells it `class_`;
    # an attribute dictionary is used here instead.
    div = soup.find(
        name='div',
        attrs={"id": "auto-channel-lazyload-article", "class": "article-wrapper"},
    )
    if div is None:
        raise SystemExit('article container not found; the page layout may have changed')

    # The image directory must exist before files can be written into it.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    with open(RESULT_FILE, 'w', encoding='utf-8') as t:
        for li in div.find_all(name='li'):
            h3 = li.find(name='h3')
            if h3 is None:
                # Items without a heading are not articles; skip them.
                continue

            a = li.find('a')
            p = li.find(name='p')
            href = a.get('href') if a is not None else None

            # One record per article: title, link, summary, blank separator.
            t.write(h3.text + '\n')
            t.write((href or '') + '\n')
            t.write((_summary_text(p) if p is not None else '') + '\n')
            t.write('\n')

            img = li.find(name='img')
            src = img.get('src') if img is not None else None
            if src:
                _save_image(src)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# coding:utf-8
import requests

# Always send browser-like request headers: visit an ordinary page first and
# look as much like a real browser as possible.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

# NOTE(review): all three URLs are blank in this copy of the script; fill them
# in before running, because requests rejects an empty URL.
HOME_URL = ""
LOGIN_URL = ""
VOTE_URL = ""

# NOTE(review): credentials are hard-coded in the source; move them to
# environment variables or a prompt before sharing this file.
LOGIN_FORM = {
    "phone": "8618912600100",
    "password": "wodemima",
    "oneMonth": "1",
}


def main():
    """Visit the site, log in with the first-visit cookies, then send an upvote."""
    # 1. Visit the page first to obtain a (not yet authorised) cookie.
    ret = requests.get(url=HOME_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # print(ret.text)
    r1_cookie_dict = ret.cookies.get_dict()

    # 2. Log in: send the user name and password together with the
    #    unauthorised cookie. Watch out for anti-crawler measures.
    #    The `requests.post(` call was missing here, leaving a syntax error.
    response_login = requests.post(
        url=LOGIN_URL,
        data=LOGIN_FORM,
        headers=HEADERS,
        cookies=r1_cookie_dict,
        timeout=REQUEST_TIMEOUT,
    )
    response_login.raise_for_status()
    # print(response_login.text)
    # The cookies returned by the login response are deliberately not used:
    # cookie_dict = response_login.cookies.get_dict()

    # 3. Upvote, re-using the cookies from the first visit.
    #    NOTE(review): the HTTP method was lost in this copy; POST is assumed
    #    for a state-changing vote -- confirm against the site.
    r1 = requests.post(
        url=VOTE_URL,
        headers=HEADERS,
        cookies=r1_cookie_dict,
        timeout=REQUEST_TIMEOUT,
    )
    print(r1.text)
    # Sample response recorded by the author:
    # {"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53074732774","likedTime":"1530752755154000","lvCount":"21","nick":"aabbccdd","uvCount":"1","voteTime":"小于1分钟前"}}}


if __name__ == '__main__':
    try:
        main()
    except requests.RequestException as exc:
        raise SystemExit('request failed: %s' % exc)
#!/usr/bin/env python
# coding:utf-8
import re  # only needed by the optional digit-extraction snippet below

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth, HTTPDigestAuth

# Quick reference
# ---------------
# requests.get(url="xxx")    is  requests.request(method="get", url="xxx")
# requests.post(url="xxx")   is  requests.request(method="post", url="xxx")
#
# Parameters worth memorising:
#   url      address
#   params   parameters passed in the URL query string
#   headers  request headers
#   cookies  cookies
#   data     request body data

REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

# --- params: search and print the result-count text -------------------------
# NOTE(review): the URL is blank in this copy of the script; fill it in before
# running, because requests rejects an empty URL.
ret = requests.get(
    url="",
    params={"wd": "王历宏"},
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    },
    timeout=REQUEST_TIMEOUT,
)
# Decode with the encoding detected from the page content itself.
ret.encoding = ret.apparent_encoding
# print(ret.text)
soup = BeautifulSoup(ret.text, 'html.parser')
div = soup.find(name='span', attrs={"class": "nums_text"})
if div is None:
    # The span is absent when the page layout differs or the request was blocked.
    print('result-count element not found')
else:
    # To keep only the digits:
    # lis = re.findall(r"\d+", div.text)
    # print("".join(lis))
    print(div.text)

# --- json parameter ---------------------------------------------------------
# requests.post(
#     url="",
#     # json={
#     #     'name': 'alex',
#     #     'passwd': '123456',
#     # },
#     headers={},
#     cookies={},
#     # If it is unclear whether the server wants form data or a request
#     # payload, send a pre-serialised body like this:
#     data=json.dumps({
#         'name': 'alex',
#         'pwd': '123456',
#     })
# )

# --- uploading files --------------------------------------------------------
# (no example recorded)

# --- auth: HTTP basic authentication (browser pop-up style) -----------------
# NOTE(review): the URL and user name are blank in this copy of the script.
res = requests.get(
    '',
    # GitHub no longer accepts the account password here; use a token
    # generated on GitHub instead.
    auth=HTTPBasicAuth("", "11223344"),
    # Digest authentication is passed the same way with a different class:
    # auth=HTTPDigestAuth("", "11223344"),
    timeout=REQUEST_TIMEOUT,
)
print(res.text)

# --- timeout: how long to wait for the server --------------------------------
# --- allow_redirects ---------------------------------------------------------

# --- proxies ----------------------------------------------------------------
# proxies = {
#     "http": "",
#     "https": "",
# }
# ret = requests.get("", proxies=proxies)
# proxies2 = {"": ""}

# Proxy dictionary combined with a user name and password:
# from requests.auth import HTTPProxyAuth
# proxy_dict = {
#     'http': '',
#     'https': '',
# }
# auth = HTTPProxyAuth('username', 'mypwd')
# r = requests.get("", proxies=proxy_dict, auth=auth)
#!/usr/bin/env python
# coding:utf-8
import getpass

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
}
REQUEST_TIMEOUT = 15  # seconds; without a timeout a stalled server blocks forever

# NOTE(review): all three URLs are blank in this copy of the script; fill them
# in before running, because requests rejects an empty URL.
LOGIN_PAGE_URL = ""
SESSION_URL = ""
PROFILE_URL = ""


def _form_value(soup, tag, field_name):
    """Return the current value of the form control ``field_name``, or None if absent.

    ``input`` controls keep their value in the ``value`` attribute, a
    ``textarea`` keeps it as its text content, and a ``select`` keeps it on
    the ``option`` child that carries the ``selected`` attribute.
    """
    node = soup.find(name=tag, attrs={"name": field_name})
    if node is None:
        return None
    if tag == "textarea":
        return node.get_text().strip()
    if tag == "select":
        chosen = node.find(name="option", attrs={"selected": True})
        if chosen is None:
            return None
        return chosen.get("value", chosen.get_text().strip())
    return node.get("value")


def main():
    """Log in to GitHub through the web form and print the profile settings."""
    username = input("请输入github账号:")
    # getpass keeps the password from being echoed to the terminal.
    pwd = getpass.getpass("请输入github密码:")
    print("请稍等几秒... ")

    # 1. Open the login page to get the first cookies and the CSRF token.
    ret1 = requests.get(url=LOGIN_PAGE_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    ret1.raise_for_status()
    r1_cookie_dict = ret1.cookies.get_dict()
    soup1 = BeautifulSoup(ret1.text, features='lxml')
    token1 = _form_value(soup1, "input", "authenticity_token")
    if token1 is None:
        raise SystemExit('authenticity_token not found on the login page')

    # 2. Submit the login form together with the first-visit cookies.
    #    The `requests.post(` call was missing here, leaving a syntax error.
    ret2 = requests.post(
        url=SESSION_URL,
        data={
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": token1,
            "login": username,
            "password": pwd,
        },
        headers=HEADERS,
        cookies=r1_cookie_dict,
        timeout=REQUEST_TIMEOUT,
    )
    ret2.raise_for_status()
    r2_cookie_dict = ret2.cookies.get_dict()
    # Send what a browser would: the first-visit cookies, overridden by
    # whatever the login response re-issued.
    session_cookies = dict(r1_cookie_dict)
    session_cookies.update(r2_cookie_dict)

    # 3. The assignment asks for the personal information, so open the
    #    profile settings page with the logged-in cookies.
    ret3 = requests.get(
        url=PROFILE_URL,
        headers=HEADERS,
        cookies=session_cookies,
        timeout=REQUEST_TIMEOUT,
    )
    ret3.raise_for_status()

    # 4. Look up and print the personal information.
    soup3 = BeautifulSoup(ret3.text, features='lxml')
    user_info_name = _form_value(soup3, "input", "user[profile_name]")
    user_info_email = _form_value(soup3, "select", "user[profile_email]")
    user_info_bio = _form_value(soup3, "textarea", "user[profile_bio]")
    user_info_url = _form_value(soup3, "input", "user[profile_blog]")
    user_info_company = _form_value(soup3, "input", "user[profile_company]")
    user_info_location = _form_value(soup3, "input", "user[profile_location]")

    print('Name: ', user_info_name)
    print('Public email: ', user_info_email)
    print('Bio: ', user_info_bio)
    print('URL: ', user_info_url)
    print('Company: ', user_info_company)
    print('Location: ', user_info_location)

    # Alternative through the API (tried by the author; returns the data
    # directly as a dictionary):
    # from requests.auth import HTTPBasicAuth
    # res = requests.get('', auth=HTTPBasicAuth(username, pwd))
    # print(res.text)


if __name__ == '__main__':
    try:
        main()
    except requests.RequestException as exc:
        raise SystemExit('request failed: %s' % exc)
1. 请了解一下 Python 的 PEP 8 规范。
2. 请求头一定要写完整,不要这样暴露你的爬虫请求,这是不好的习惯。
3. 代码的注释最好写在文档里。
4. 每个请求都一定要用 try 包起来,这在爬虫里很重要:你要保证爬虫能稳定运行。
5. 代码应该封装成函数。
6. 写任何项目的时候,都要注意项目结构。
7. 作业写得很好。其实生产环境中 bs4 用得并不多,pyquery 或者路径获取的方式用得更多。
# 安装
pip install you-get
# 使用,命令行下:
you-get <视频页面的URL>