Web Scraping in Practice
Scraping Lagou job listings
# https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
import requests

# the URL we actually scrape
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
payload = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}
# the original search-page URL
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
# create a session
s = requests.Session()
# visit the search page first to obtain its cookies
s.get(urls, headers=header, timeout=3)
# the cookies obtained by that request
cookie = s.cookies
# fetch the job data with those cookies
response = s.post(url, data=payload, headers=header, cookies=cookie, timeout=5).text
print(response)
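The POST above returns JSON as text. Below is a minimal sketch of pulling fields out of it, assuming the payload still follows the content -> positionResult -> result layout Lagou used at the time; the field names positionName, companyFullName and salary are likewise assumptions, not guaranteed by the site.

import json

data = json.loads(response)
# Walk the assumed content -> positionResult -> result list; .get() keeps
# the sketch from crashing if the layout has changed.
for position in data.get('content', {}).get('positionResult', {}).get('result', []):
    print(position.get('positionName'), position.get('companyFullName'), position.get('salary'))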
Scraping the novel Dream of the Red Chamber
# http://www.shicimingju.com/book/hongloumeng.html
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# print(ret.text)
soup = BeautifulSoup(ret.text, 'lxml')
li_list = soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt', 'w', encoding='utf-8') as f:
    for li in li_list:
        content = li.find('a').text
        url = 'https://www.shicimingju.com' + li.find('a').get('href')
        f.write(content)
        f.write('\n')
        res_content = requests.get(url)
        soup2 = BeautifulSoup(res_content.text, 'lxml')
        content_detail = soup2.find(class_='chapter_content').text
        f.write(content_detail)
        f.write('\n')
        print(content, 'written')
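If the saved chapters come out garbled, a likely cause is requests mis-guessing the response encoding when the server omits a charset header. A small sketch of a defensive chapter fetcher, forcing utf-8 (an assumption about the site's actual encoding, not documented behavior) and pausing between requests:

import time
import requests
from bs4 import BeautifulSoup

def fetch_chapter(chapter_url):
    res = requests.get(chapter_url, timeout=5)
    # Force utf-8 in case the server omits the charset header;
    # this is an assumption, not documented site behavior.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')
    # Same chapter_content class as the loop above.
    return soup.find(class_='chapter_content').text

# Usage inside the loop above, with a polite (arbitrary) one-second pause:
#     content_detail = fetch_chapter(url)
#     time.sleep(1)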
Scraping KFC store locations
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
data = {
    'cname': '',
    'pid': 20,
    'keyword': '浦东',
    'pageIndex': 1,
    'pageSize': 10
}
ret = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword', data=data, headers=header)
print(ret.json())
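The endpoint is paginated through pageIndex, so walking every result page for a keyword is a small loop. A sketch, assuming the store list sits under the 'Table1' key (the field name the API returned at the time) and that an empty page means the end:

def iter_stores(keyword, page_size=10):
    # Reuses the header dict defined above.
    page = 1
    while True:
        payload = {'cname': '', 'pid': 20, 'keyword': keyword,
                   'pageIndex': page, 'pageSize': page_size}
        ret = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
                            data=payload, headers=header)
        # 'Table1' is assumed from the API's response shape at the time;
        # an empty page is assumed to mean we have walked every store.
        stores = ret.json().get('Table1', [])
        if not stores:
            return
        yield from stores
        page += 1

for store in iter_stores('浦东'):
    print(store)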
Scraping Qiushibaike jokes
# https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.qiushibaike.com/text/page/2/')
# print(ret.text)
soup = BeautifulSoup(ret.text, 'html.parser')
article_list = soup.find_all(class_='article')
# print(article_list)
for article in article_list:
    content = article.find(class_='content').text
    print(content)
    print('-------')
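The /text/page/<n>/ pattern in the URL makes multi-page scraping a one-line change. A sketch over the first few pages (the page range is an arbitrary choice):

for page in range(1, 5):
    ret = requests.get('https://www.qiushibaike.com/text/page/%s/' % page)
    soup = BeautifulSoup(ret.text, 'html.parser')
    # Same .article / .content classes as above.
    for article in soup.find_all(class_='article'):
        print(article.find(class_='content').text.strip())
        print('-------')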
Scraping JD product information
from selenium import webdriver
import time
# for simulating keyboard input
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# set an implicit wait
bro.implicitly_wait(10)

def get_goods_info(bro):
    # li_list = bro.find_element_by_class_name('gl-warp').find_elements_by_tag_name('li')
    # goods = bro.find_elements_by_class_name('gl-item')
    goods = bro.find_elements_by_css_selector('.gl-item')
    # print(len(goods))
    for good in goods:
        try:
            price = good.find_element_by_css_selector('.p-price i').text
            name = good.find_element_by_css_selector('.p-name em').text
            url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
            commits = good.find_element_by_css_selector('.p-commit strong>a').text
            photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')
            print('''
            Product name: %s
            Product price: %s
            Product URL: %s
            Comment count: %s
            Image URL: %s
            ''' % (name, price, url, commits, photo_url))
        except Exception as e:
            continue
    # '下一页' is the "next page" link text on JD's result page
    next_button = bro.find_element_by_partial_link_text('下一页')
    time.sleep(1)
    next_button.click()
    get_goods_info(bro)

try:
    bro.get('https://www.jd.com/')
    input_k = bro.find_element_by_id('key')
    input_k.send_keys('奶牛')
    # simulate pressing the Enter key
    input_k.send_keys(Keys.ENTER)
    get_goods_info(bro)
except Exception as e:
    print(e)
finally:
    bro.close()
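To keep the results instead of printing them, the scraped fields can be collected into rows and written out at the end. A sketch of the writer; the file name and column headers are our own choices, not anything JD defines:

import csv

def save_goods(rows, path='jd_goods.csv'):
    # rows: a list of (name, price, url, commits, photo_url) tuples
    # gathered in the loop above in place of the print call.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'price', 'url', 'commits', 'photo_url'])
        writer.writerows(rows)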
Auto-login to 12306
from selenium import webdriver
import time
# pillow
from PIL import Image
# the Chaojiying captcha-solving client
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    bro.maximize_window()  # maximize the window to full screen
    button_z = bro.find_element_by_css_selector('.login-hd-account a')
    button_z.click()
    time.sleep(2)
    # capture the whole screen
    bro.save_screenshot('./main.png')
    # position and size of the captcha image
    img_t = bro.find_element_by_id('J-loginImg')
    print(img_t.size)
    print(img_t.location)
    size = img_t.size
    location = img_t.location
    img_tu = (int(location['x']), int(location['y']),
              int(location['x'] + size['width']), int(location['y'] + size['height']))
    # crop out the captcha
    # open the screenshot
    img = Image.open('./main.png')
    # crop
    fram = img.crop(img_tu)
    # save the cropped image
    fram.save('code.png')
    # call Chaojiying to solve the captcha
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')  # User Center >> Software ID: generate one to replace 96001
    im = open('code.png', 'rb').read()  # path to the local image file; on Windows the path sometimes needs //
    # print(chaojiying.PostPic(im, 9004))
    # with multiple targets the result looks like 260,133|123,233; parse it into [[260,133],[123,233]]
    res = chaojiying.PostPic(im, 9004)
    print(res)
    result = res['pic_str']
    all_list = []
    if '|' in result:
        list_1 = result.split('|')
        count_1 = len(list_1)
        for i in range(count_1):
            xy_list = []
            x = int(list_1[i].split(',')[0])
            y = int(list_1[i].split(',')[1])
            xy_list.append(x)
            xy_list.append(y)
            all_list.append(xy_list)
    else:
        x = int(result.split(',')[0])
        y = int(result.split(',')[1])
        xy_list = []
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
    print(all_list)
    # click the captcha images with action chains
    # [[260,133],[123,233]]
    for a in all_list:
        x = a[0]
        y = a[1]
        ActionChains(bro).move_to_element_with_offset(img_t, x, y).click().perform()
        time.sleep(1)
    username = bro.find_element_by_id('J-userName')
    username.send_keys('306334678')
    password = bro.find_element_by_id('J-password')
    password.send_keys('lqz12345')
    time.sleep(3)
    submit_login = bro.find_element_by_id('J-login')
    submit_login.click()
    time.sleep(3)
    print(bro.get_cookies())
    time.sleep(10)
    bro.get('https://www.12306.cn/index/')
    time.sleep(5)
except Exception as e:
    print(e)
finally:
    bro.close()
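One pitfall with the crop above: on a scaled display (device pixel ratio other than 1, common on macOS and high-DPI Windows) the screenshot has more pixels than the CSS coordinates Selenium reports, so the crop box lands in the wrong place. A sketch of compensating, to slot in where img_tu is built, assuming a uniform scale factor exposed as window.devicePixelRatio:

# Scale the CSS-pixel coordinates up to screenshot pixels.
ratio = bro.execute_script('return window.devicePixelRatio')
img_tu = (int(location['x'] * ratio),
          int(location['y'] * ratio),
          int((location['x'] + size['width']) * ratio),
          int((location['y'] + size['height']) * ratio))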