Case review
Douban
import requests
import re
import time
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()  # create the workbook
wb1 = wb.create_sheet('豆瓣表格', 0)  # set up the worksheet
wb1.append(['电影名', '导演', '主演', '评分', '评价人数', '短评'])  # write the header row
def movie_rank(n):
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    res = requests.get(url,
                       headers=headers,
                       params={'start': n})
    # print(res.text)
    net = res.text
    soup = BeautifulSoup(net, 'lxml')
    title_list = re.findall('<img width="100" alt="(.*?)" src=', net)
    direct_list = re.findall('导演: (.*?) ', net)
    actor_list = re.findall(' 主演: (.*?) /...', net)
    score_list = re.findall('property="v:average">(.*?)</span>', net)
    comment_count_list = re.findall('<span>(.*?)人评价</span>', net)
    li_list = soup.select('ol.grid_view>li')
    inq_list = []
    for li in li_list:
        li = str(li)  # li is a bs4 Tag, not a string; convert it first so the membership test and re.findall below work (this tripped me up for a while last time)
        if '<span class="inq">' in li:  # the entry has a short review
            inq_part = re.findall('<span class="inq">(.*?)</span>', li)  # grab it with the regex
            inq_list.append(inq_part[0])  # take the element out of the list before appending
        else:
            inq_part = '等你评价'  # no review yet, so write a placeholder
            inq_list.append(inq_part)
    full_info = zip(title_list, direct_list, actor_list, score_list, comment_count_list, inq_list)  # stitch the columns together row by row
    for i in full_info:
        wb1.append(list(i))
    time.sleep(10)


for n in range(0, 226, 25):  # start = 0, 25, ..., 225 covers all ten pages of the Top 250
    movie_rank(n)
wb.save(r'豆瓣top250.xlsx')
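A per-item variant for reference: the page-wide regex lists above are zipped positionally, so one missed match shifts every later row. Pulling all fields from the same li node keeps each row consistent. This is only a sketch; the img alt, property="v:average", 人评价 and span.inq hooks are the ones the code above already uses, everything else about the markup is assumed.
# sketch: extract every field from one <li> at a time instead of zipping page-wide lists
def parse_li(li):
    li_html = str(li)
    title = li.select_one('img')['alt']                       # same alt attribute the regex above reads
    score = li.select_one('span[property="v:average"]').text  # same hook as the score regex above
    direct = re.findall('导演: (.*?) ', li_html)
    actor = re.findall(' 主演: (.*?) /...', li_html)
    count = re.findall('<span>(.*?)人评价</span>', li_html)
    inq = li.select_one('span.inq')
    return [title,
            direct[0] if direct else '',
            actor[0] if actor else '',
            score,
            count[0] if count else '',
            inq.text if inq else '等你评价']
# inside movie_rank the zip step would then become: for li in li_list: wb1.append(parse_li(li))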
Zhubajie (zbj.com)
import requests
from lxml import etree
from openpyxl import Workbook
wb = Workbook()
wb1 = wb.create_sheet('外包', 0)
wb1.append(['公司名称', '价格', '成交数', '业务范畴', '详情链接'])
work = input('请输入你想要的业务').strip()
res = requests.get('https://shanghai.zbj.com/search/f/',
                   params={'kw': work})
x_html = etree.HTML(res.text)
# A global search needs the tag pinned down tightly, so the outer div is constrained by three class checks
company_tag_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[1]/div[1]/p/text()')
# print(company_tag_list)
company_list = []
for company_tag in company_tag_list:
    if len(company_tag) < 5:  # skip the whitespace-only text nodes mixed into the results
        continue
    else:
        company_list.append(company_tag.strip('\n'))
# print(company_list)
price_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]/div[2]/div[1]/span[1]/text()')
# print(price_list)
deal_num_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]/div[2]/div[1]/span[2]/text()')
item_include_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]/div[2]/div[2]/p[1]/text()')
link_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]//@href')
# print(link_list)
full_info_list = zip(company_list, price_list, deal_num_list, item_include_list, link_list)
for full_info in full_info_list:
    # print(full_info)
    wb1.append(list(full_info))
wb.save(r'八戒八戒.xlsx')
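The long absolute prefix above is repeated in every query. The same three-class constraint can also be applied once to grab each item node, and the short relative paths (the same ones used above, so the same markup assumptions apply) then run against that node, which keeps the fields of one card together even when a column is missing somewhere. A sketch:
item_nodes = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]')
for node in item_nodes:
    name = node.xpath('./div[1]/div[1]/a[1]/div[1]/p/text()')
    price = node.xpath('./div[1]/div[1]/a[2]/div[2]/div[1]/span[1]/text()')
    link = node.xpath('./div[1]/div[1]/a[2]//@href')
    name = [t.strip('\n') for t in name if len(t) >= 5]  # same filter as above: drop whitespace-only text nodes
    if name:
        print(name[0], price[0] if price else '', link[0] if link else '')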
Cities
import requests
from lxml import etree
'''
1. A look at the page source shows the data is written directly into the HTML.
2. The city names sit in two separate div blocks, so scrape the two blocks separately (or combine both XPaths with |).
'''
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
'Referer': 'https://www.aqistudy.cn/historydata/'}
res = requests.get('https://www.aqistudy.cn/historydata/',
headers=headers)
# print(res.text)
tree = etree.HTML(res.text)  # don't reuse the name etree here, it would shadow the imported module
hot_city_list = tree.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li/a/text()')
# print(hot_city_list)
hot_city_link_list = tree.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li/a/@href')
# print(hot_city_link_list)
normal_city_list = tree.xpath('/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()')
# print(normal_city_list)
all_city = tree.xpath(
    '/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li/a/text() | /html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()')
print(all_city)
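The href values collected into hot_city_link_list still have to be turned into full URLs before a city page can be requested. A small follow-up sketch; urljoin resolves relative links against the page requested above and leaves absolute ones unchanged:
from urllib.parse import urljoin
for city, href in zip(hot_city_list, hot_city_link_list):
    print(city, urljoin('https://www.aqistudy.cn/historydata/', href))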
Baidu login
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
bro = webdriver.Chrome()
bro.get('http://www.baidu.com')
bro.implicitly_wait(10)
login = bro.find_element_by_id('s-top-loginbtn')
login.click()
# Check whether the login panel is loaded inside an iframe; it is not here, so the inputs can be located directly
username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
username.send_keys('13338888888')
password.send_keys(input('输入密码'))
time.sleep(5)
password.send_keys(Keys.ENTER)
time.sleep(5)
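The comment in the script notes that Baidu's login panel is not inside an iframe, so the inputs can be located directly. On sites where the form does sit in an iframe, the driver has to switch into the frame before it can see the inputs. A minimal sketch, with 'login_frame' as a made-up frame id:
# hypothetical: only needed when the login form lives inside an <iframe>
bro.switch_to.frame('login_frame')  # 'login_frame' stands in for the real frame id or name
# locate the username/password inputs here exactly as above
bro.switch_to.default_content()     # switch back to the top-level document afterwards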
JD (jd.com)
import time
from selenium import webdriver
from openpyxl import Workbook
bro = webdriver.Chrome()
bro.get('https://www.jd.com')
bro.implicitly_wait(10)
input_tag = bro.find_element_by_id('key')
goods = '塑料拖把'
input_tag.send_keys(goods)
search_tag = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
search_tag.click()
wb = Workbook()
wb1 = wb.create_sheet(goods, 0)
wb1.append(['图片链接', '详情', '价格'])
def get_info():
    for i in range(0, 12000, 1000):  # scroll down in steps so JD's lazy-loaded items render before they are read
        bro.execute_script('window.scrollTo(0,%s)' % i)
        time.sleep(0.3)
    good_list = bro.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li')
    for good_tag in good_list:
        img_link = good_tag.find_element_by_xpath('./div[1]/div[1]/a[1]/img').get_attribute('src')
        # print(img_link)
        if not img_link:
            # items that were never scrolled into view keep the image address in data-lazy-img instead of src
            img_link = 'https:' + good_tag.find_element_by_xpath('./div[1]/div[1]/a[1]/img').get_attribute(
                'data-lazy-img')
            # print(img_link)
        desc = good_tag.find_element_by_xpath('./div[1]/div[3]/a[1]/em').text
        # print(desc)
        price = good_tag.find_element_by_css_selector('div > div.p-price > strong > i').text
        # print(price)
        wb1.append([img_link, desc, price])
    next_page_tag = bro.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[@class="pn-next"]')
    next_page_tag.click()
    get_info()  # recurse onto the next page; the try/except below ends the run when "next page" can no longer be clicked


try:
    get_info()
except BaseException as e:
    print('就到这里')  # reached the last page (or something else broke), stop here
finally:
    wb.save('%s.xlsx' % goods)
    bro.close()
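The paging above relies on implicitly_wait plus fixed sleeps, and the bare except is what stops the run on the last page. An explicit wait on the next-page button is a common alternative; a sketch using Selenium's WebDriverWait and the same pn-next class the XPath above targets:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(driver, timeout=10):
    # block until the "next page" link is clickable, then click it;
    # a timeout here is the signal that the last page has been reached
    next_btn = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'pn-next'))
    )
    next_btn.click()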