A small crawler example that requires simulated clicks

Spent a day writing a small crawler with simple functionality, using Selenium.
The crawler mainly covers:
1. Simulating a user login
2. Simulating clicks on dropdown menus
3. Locating text inside a dropdown menu and clicking to select it
4. Double-clicking text to turn hidden text ==> visible text
5. Jumping back to the first page after a query
6. Clicking "next page" to paginate, checking whether the current page is the last one
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
import random

# Note: the find_element_by_* calls below are the Selenium 3.x API;
# Selenium 4 replaced them with find_element(By.XPATH, ...).

'''
Crawler function. Takes the driver, the province and city indices, the
dropdown element lists, and the province name.
driver.execute_script("arguments[0].scrollIntoView();", lis_p[p]) scrolls the
dropdown until the target province/city entry is in view.
Clicks the province and city dropdowns, clicks "query", and waits for the
page to load.
Jumps to the first page first, then loops page by page collecting the text
from the table.
'''
def paqu(driver, p, c, lis_p, lis_c, p_name):
    # Open the province dropdown
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)

    # Scroll the target province into view and select it
    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])
    lis_p[p].click()
    time.sleep(5)

    # City dropdown
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(3)
    driver.execute_script("arguments[0].scrollIntoView();", lis_c[c])
    c_name = lis_c[c].text
    lis_c[c].click()
    time.sleep(3)

    # Click "query"
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[20]/div/button/span').click()
    time.sleep(5)
    try:
        # Jump to the first page
        driver.find_element_by_xpath(
            '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/span[1]').click()
        time.sleep(5)
        # Total number of pages: the last <li> of the pager
        ul_page = driver.find_element_by_xpath(
            '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/ul')
        li_page = ul_page.find_elements_by_xpath('li')
        pages = int(li_page[-1].text)
        for i in range(1, pages + 1):
            print(i)
            try:
                # Customer numbers, 3rd table column
                v_num = driver.find_elements_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[1]/div/div[3]/table/tbody/tr/td[3]/div/span')
                list_num = [span.text for span in v_num]

                # Customer names, 2nd table column; hidden until double-clicked
                spans = driver.find_elements_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[1]/div/div[3]/table/tbody/tr/td[2]/div/span')

                # Use j here: the original reused i, shadowing the page counter
                for j in range(len(spans)):
                    # Build a fresh chain each time so earlier actions are not
                    # replayed; the double-click reveals the hidden text
                    ActionChains(driver).double_click(spans[j]).perform()
                    time.sleep(random.randint(2, 5))
                    print(p_name + ' ' + c_name + ' ' + spans[j].text + ' ' + list_num[j])
                    # f is the output file opened in the __main__ block (a global)
                    f.write(p_name + ' ' + c_name + ' ' + spans[j].text + ' ' + list_num[j])
                    f.write('\n')

                # "Next page" button; it is disabled on the last page
                but = driver.find_element_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/button[2]')
                flag = but.is_enabled()
                print(flag)
                if flag:
                    but.click()
                    time.sleep(5)
                else:
                    print('This is the last page')
            except:
                print('Query returned no results')
                # Reset the pager to the first page before moving on
                driver.find_element_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/span[1]').click()
                time.sleep(5)
    except:
        pass
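
A more robust alternative to the fixed time.sleep calls above is Selenium's
explicit wait, which polls until a condition holds instead of pausing for a
hard-coded interval. A minimal sketch, assuming a 20-second timeout (the
helper name click_when_ready is my own, not part of the original code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_when_ready(driver, xpath, timeout=20):
    # Poll for up to `timeout` seconds until the element is clickable, then
    # click it; a TimeoutException is raised if it never becomes clickable.
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    ).click()

Each click-then-sleep pair in paqu could then become a single
click_when_ready call, waiting exactly as long as the page actually needs.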

'''
Click the dropdown menus and get the number of cities under a province.
'''
def get_citynum(driver, p):
    # Open the province dropdown
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)
    ul_p = driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[2]/div[1]/div[1]/ul')
    lis_p = ul_p.find_elements_by_xpath('li')
    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])
    p_name = lis_p[p].text
    lis_p[p].click()
    time.sleep(5)

    # City dropdown
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)
    ul_c = driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[2]/div[1]/div[1]/ul')
    lis_c = ul_c.find_elements_by_xpath('li')
    num = len(lis_c)
    return num, lis_p, lis_c, p_name
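
Worth noting: the province and city dropdown XPaths above are identical
except for the form row index (div[11] vs div[12]). If the page structure
really only differs there, a small helper removes the duplication; a sketch
with a hypothetical FORM_ROW template derived from the XPaths above:

FORM_ROW = '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[{row}]'

def open_dropdown(driver, row):
    # Click the arrow icon that expands the dropdown in the given form row
    driver.find_element_by_xpath(FORM_ROW.format(row=row) + '/div/div[2]/div[1]/span/span/i').click()

def dropdown_items(driver, row):
    # Return the <li> entries of the expanded dropdown list
    ul = driver.find_element_by_xpath(FORM_ROW.format(row=row) + '/div/div[2]/div[2]/div[1]/div[1]/ul')
    return ul.find_elements_by_xpath('li')

get_citynum(driver, p) would then reduce to open_dropdown / dropdown_items
calls with rows 11 and 12.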


if __name__ == "__main__":
    # Launch the Chrome browser (the original comment said Firefox,
    # but the code actually uses Chrome)
    driver = webdriver.Chrome()
    # The real URL is redacted in the original post
    url = '******'
    driver.get(url)
    time.sleep(5)
    # Enter the username
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[1]/div/label/div/input').clear()
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[1]/div/label/div/input').send_keys('******')
    # Enter the login password
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[2]/div/label/div/input').clear()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[2]/div/label/div/input').send_keys('******')
    # Enter the captcha, typed in manually at the console
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[3]/div/div/label/div/input').send_keys(
        input("Enter the captcha: "))
    # Click "log in"
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/label[1]/button').click()
    time.sleep(5)
    # "Customer analysis" menu
    driver.find_element_by_xpath('//*[@id="app"]/div/header/nav/div[8]').click()
    time.sleep(5)
    # "Potential vehicle sales analysis" entry
    driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[1]/div/div[2]/div/ul/li[5]').click()
    time.sleep(5)
    for p in range(1, 2):
        f = open(r'E:\MyPythonProject\xiaoshou_kehu\Vehicle_distribution' + str(p + 1) + '.txt', 'w')
        print('Start scraping province ' + str(p + 1))
        num, lis_p, lis_c, p_name = get_citynum(driver, p)
        # The original starts from city index 4
        for c in range(4, num):
            paqu(driver, p, c, lis_p, lis_c, p_name)
        print('Finished scraping province ' + str(p + 1))
        f.close()
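
One fragile spot in the main block: f is opened at module level and read as a
global inside paqu, so an exception mid-crawl can leave it unclosed. A context
manager fixes that; a sketch assuming paqu is given an explicit file parameter
(a hypothetical signature change, not in the original code):

    out_path = r'E:\MyPythonProject\xiaoshou_kehu\Vehicle_distribution' + str(p + 1) + '.txt'
    with open(out_path, 'w') as f:
        # the with-block closes f even if a scrape step raises
        num, lis_p, lis_c, p_name = get_citynum(driver, p)
        for c in range(4, num):
            paqu(driver, p, c, lis_p, lis_c, p_name)  # variant: paqu(..., f)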

The crawler I wrote is too simple and needs further refinement. I did not
think deeply about whether the design is complete; user agents, request
frequency and the like were not carefully considered at all.
Recording this small crawler case for now; I will improve it when I have time.
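
For the user-agent and request-frequency points, two small pieces would go a
long way: set a custom user agent through ChromeOptions when the driver
starts, and jitter the delay between actions instead of fixed sleeps. A
minimal sketch (the user-agent string and the delay bounds are arbitrary
example values):

from selenium import webdriver
import random
import time

options = webdriver.ChromeOptions()
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
driver = webdriver.Chrome(options=options)

def polite_sleep(low=2.0, high=6.0):
    # Randomized pause so actions are not evenly spaced
    time.sleep(random.uniform(low, high))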