Case review
Douban
import requests
import re
import time
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()  # create the workbook
wb1 = wb.create_sheet('豆瓣表格', 0)  # set up the worksheet
wb1.append(['电影名', '导演', '主演', '评分', '评价人数', '短评'])  # write the header row
def movie_rank(n):
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    res = requests.get(url,
                       headers=headers,
                       params={'start': n})
    # print(res.text)
    net = res.text
    soup = BeautifulSoup(net, 'lxml')
    title_list = re.findall('<img width="100" alt="(.*?)" src=', net)
    direct_list = re.findall('导演: (.*?) ', net)
    actor_list = re.findall(' 主演: (.*?) /...', net)
    score_list = re.findall('property="v:average">(.*?)</span>', net)
    comment_count_list = re.findall('<span>(.*?)人评价</span>', net)
    li_list = soup.select('ol.grid_view>li')
    inq_list = []
    for li in li_list:
        li = str(li)  # li is a bs4 Tag, not a string; convert it first so the membership test and re.findall below work (this tripped me up for a while last time)
        if '<span class="inq">' in li:  # the entry has a short review
            inq_part = re.findall('<span class="inq">(.*?)</span>', li)  # grab it with the regex
            inq_list.append(inq_part[0])  # take the element out of the list before appending
        else:
            inq_part = '等你评价'  # no review yet, so write a placeholder
            inq_list.append(inq_part)
    full_info = zip(title_list, direct_list, actor_list, score_list, comment_count_list, inq_list)  # stitch the columns together row by row
    for i in full_info:
        wb1.append(list(i))
    time.sleep(10)


for n in range(0, 226, 25):  # start = 0, 25, ..., 225 covers all ten pages of the Top 250
    movie_rank(n)
wb.save(r'豆瓣top250.xlsx')
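A per-item variant for reference: the page-wide regex lists above are zipped positionally, so one missed match shifts every later row. Pulling all fields from the same li node keeps each row consistent. This is only a sketch; the img alt, property="v:average", 人评价 and span.inq hooks are the ones the code above already uses, everything else about the markup is assumed.
# sketch: extract every field from one <li> at a time instead of zipping page-wide lists
def parse_li(li):
    li_html = str(li)
    title = li.select_one('img')['alt']                       # same alt attribute the regex above reads
    score = li.select_one('span[property="v:average"]').text  # same hook as the score regex above
    direct = re.findall('导演: (.*?) ', li_html)
    actor = re.findall(' 主演: (.*?) /...', li_html)
    count = re.findall('<span>(.*?)人评价</span>', li_html)
    inq = li.select_one('span.inq')
    return [title,
            direct[0] if direct else '',
            actor[0] if actor else '',
            score,
            count[0] if count else '',
            inq.text if inq else '等你评价']
# inside movie_rank the zip step would then become: for li in li_list: wb1.append(parse_li(li))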
Zhubajie (zbj.com)
import requests
from lxml import etree
from openpyxl import Workbook
wb = Workbook()
wb1 = wb.create_sheet('外包', 0)
wb1.append(['公司名称', '价格', '成交数', '业务范畴', '详情链接'])
work = input('请输入你想要的业务').strip()
res = requests.get('https://shanghai.zbj.com/search/f/',
                   params={'kw': work})
x_html = etree.HTML(res.text)
# A global search needs the tag pinned down tightly, so the outer div is constrained by three class checks
company_tag_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[1]/div[1]/p/text()')
# print(company_tag_list)
company_list = []
for company_tag in company_tag_list:
    if len(company_tag) < 5:  # skip the whitespace-only text nodes mixed into the results
        continue
    else:
        company_list.append(company_tag.strip('\n'))
# print(company_list)
price_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]/div[2]/div[1]/span[1]/text()')
# print(price_list)
deal_num_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]/div[2]/div[1]/span[2]/text()')
item_include_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]/div[2]/div[2]/p[1]/text()')
link_list = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]/div[1]/div[1]/a[2]//@href')
# print(link_list)
full_info_list = zip(company_list, price_list, deal_num_list, item_include_list, link_list)
for full_info in full_info_list:
    # print(full_info)
    wb1.append(list(full_info))
wb.save(r'八戒八戒.xlsx')
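The long absolute prefix above is repeated in every query. The same three-class constraint can also be applied once to grab each item node, and the short relative paths (the same ones used above, so the same markup assumptions apply) then run against that node, which keeps the fields of one card together even when a column is missing somewhere. A sketch:
item_nodes = x_html.xpath('//div[contains(@class,"item-wrap") and contains(@class,"service-new") and contains(@class,"j-sp-item-wrap ")]')
for node in item_nodes:
    name = node.xpath('./div[1]/div[1]/a[1]/div[1]/p/text()')
    price = node.xpath('./div[1]/div[1]/a[2]/div[2]/div[1]/span[1]/text()')
    link = node.xpath('./div[1]/div[1]/a[2]//@href')
    name = [t.strip('\n') for t in name if len(t) >= 5]  # same filter as above: drop whitespace-only text nodes
    if name:
        print(name[0], price[0] if price else '', link[0] if link else '')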
Cities
import requests
from lxml import etree
'''
1. A look at the page source shows the data is written directly into the HTML.
2. The city names sit in two separate div blocks, so scrape the two blocks separately (or combine both XPaths with |).
'''
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
'Referer': 'https://www.aqistudy.cn/historydata/'}
res = requests.get('https://www.aqistudy.cn/historydata/',
headers=headers)
# print(res.text)
tree = etree.HTML(res.text)  # don't reuse the name etree here, it would shadow the imported module
hot_city_list = tree.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li/a/text()')
# print(hot_city_list)
hot_city_link_list = tree.xpath('/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li/a/@href')
# print(hot_city_link_list)
normal_city_list = tree.xpath('/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()')
# print(normal_city_list)
all_city = tree.xpath(
    '/html/body/div[3]/div/div[1]/div[1]/div[2]/ul/li/a/text() | /html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()')
print(all_city)
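The href values collected into hot_city_link_list still have to be turned into full URLs before a city page can be requested. A small follow-up sketch; urljoin resolves relative links against the page requested above and leaves absolute ones unchanged:
from urllib.parse import urljoin
for city, href in zip(hot_city_list, hot_city_link_list):
    print(city, urljoin('https://www.aqistudy.cn/historydata/', href))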
Baidu login
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
bro = webdriver.Chrome()
bro.get('http://www.baidu.com')
bro.implicitly_wait(10)
login = bro.find_element_by_id('s-top-loginbtn')
login.click()
# Check whether the login panel is loaded inside an iframe; it is not here, so the inputs can be located directly
username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
username.send_keys('13338888888')
password.send_keys(input('输入密码'))
time.sleep(5)
password.send_keys(Keys.ENTER)
time.sleep(5)
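The comment in the script notes that Baidu's login panel is not inside an iframe, so the inputs can be located directly. On sites where the form does sit in an iframe, the driver has to switch into the frame before it can see the inputs. A minimal sketch, with 'login_frame' as a made-up frame id:
# hypothetical: only needed when the login form lives inside an <iframe>
bro.switch_to.frame('login_frame')  # 'login_frame' stands in for the real frame id or name
# locate the username/password inputs here exactly as above
bro.switch_to.default_content()     # switch back to the top-level document afterwards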
JD (jd.com)
import time
from selenium import webdriver
from openpyxl import Workbook
bro = webdriver.Chrome()
bro.get('https://www.jd.com')
bro.implicitly_wait(10)
input_tag = bro.find_element_by_id('key')
goods = '塑料拖把'
input_tag.send_keys(goods)
search_tag = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
search_tag.click()
wb = Workbook()
wb1 = wb.create_sheet(goods, 0)
wb1.append(['图片链接', '详情', '价格'])
def get_info():
    for i in range(0, 12000, 1000):  # scroll down in steps so JD's lazy-loaded items render before they are read
        bro.execute_script('window.scrollTo(0,%s)' % i)
        time.sleep(0.3)
    good_list = bro.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li')
    for good_tag in good_list:
        img_link = good_tag.find_element_by_xpath('./div[1]/div[1]/a[1]/img').get_attribute('src')
        # print(img_link)
        if not img_link:
            # items that were never scrolled into view keep the image address in data-lazy-img instead of src
            img_link = 'https:' + good_tag.find_element_by_xpath('./div[1]/div[1]/a[1]/img').get_attribute(
                'data-lazy-img')
            # print(img_link)
        desc = good_tag.find_element_by_xpath('./div[1]/div[3]/a[1]/em').text
        # print(desc)
        price = good_tag.find_element_by_css_selector('div > div.p-price > strong > i').text
        # print(price)
        wb1.append([img_link, desc, price])
    next_page_tag = bro.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[@class="pn-next"]')
    next_page_tag.click()
    get_info()  # recurse onto the next page; the try/except below ends the run when "next page" can no longer be clicked


try:
    get_info()
except BaseException as e:
    print('就到这里')  # reached the last page (or something else broke), stop here
finally:
    wb.save('%s.xlsx' % goods)
    bro.close()
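The paging above relies on implicitly_wait plus fixed sleeps, and the bare except is what stops the run on the last page. An explicit wait on the next-page button is a common alternative; a sketch using Selenium's WebDriverWait and the same pn-next class the XPath above targets:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(driver, timeout=10):
    # block until the "next page" link is clickable, then click it;
    # a timeout here is the signal that the last page has been reached
    next_btn = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'pn-next'))
    )
    next_btn.click()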