Python 爬虫实例（12）—— python selenium 爬虫


# coding:utf-8
from common.contest import *


def _first_match(pattern, text):
    """Return the first `re.findall` result of *pattern* in *text*, or None if there is none."""
    matches = re.findall(pattern, text)
    return matches[0] if matches else None


def _parse_lot(lot_html):
    """Extract (item_url, item_lotnum, item_imgurl, sold_price) from one lot's HTML string.

    Every field falls back to "" when its pattern does not match, so a single
    lot with unexpected markup cannot abort the whole crawl.
    """
    href = _first_match('<figure class="Lotes fade"><a href="(.*?)" id=', lot_html)
    item_url = "http://www.salamoyua.com/es/" + href if href is not None else ""

    # NOTE(review): the two unescaped dots match ANY two characters; they are
    # presumably meant to consume a literal "../" prefix -- confirm against the
    # live markup before tightening the pattern.
    src = _first_match('<img id=".*?" src="..(.*?)" style="border-width:0px', lot_html)
    item_imgurl = "http://www.salamoyua.com" + src if src is not None else ""

    sold_price = ""
    if "Remate" in lot_html:
        price = _first_match('<p><strong>Remate:(.*?)</strong></p></figcaption>', lot_html)
        sold_price = (price or "").replace(' ', '')

    # Try the pattern anchored on title="Lote vendido" first; fall back to the
    # plain header span when it does not match.
    lotnum = _first_match('title="Lote vendido"><span id=".*?">(.*?)</span>', lot_html)
    if lotnum is None:
        lotnum = _first_match('<span id=".*?">(.*?)</span></header>', lot_html)
    item_lotnum = (lotnum or "").replace('Lote', '').replace(' ', '')

    return item_url, item_lotnum, item_imgurl, sold_price


def spider():
    """Crawl the paginated lot listing of one salamoyua.com auction and print each lot.

    Opens the auction page in Chrome through Selenium, then clicks the "next"
    link (element id ``ctl00_phContenidos_lbSiguiente``) to advance, visiting
    at most 99 pages. For every ``<figure class="Lotes fade">`` element the
    lot's HTML, URL, lot number, image URL and sold price are printed.

    The crawl stops at the first page whose "next" navigation fails, and the
    browser is always closed on exit.

    Relies on names brought in by the file's star import (``webdriver``,
    ``BeautifulSoup``, ``os``, ``re``, ``time`` and the ``replace`` helper).
    """
    url = "http://www.salamoyua.com/es/subasta.aspx?origen=subastas&subasta=79"

    chromedriver = 'C:/Users/xuchunlin/AppData/Local/Google/Chrome/Application/chromedriver.exe'
    chrome_options = webdriver.ChromeOptions()

    # To route traffic through a proxy, add an argument of the form
    # '--proxy-server=http://<host:port>' to chrome_options here.

    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)

    try:
        for page in range(1, 100):
            print("正在爬取第" + str(page) + "页的数据")

            if page == 1:
                # Initial request for the auction listing.
                driver.get(url)
            else:
                try:
                    # Scroll to the bottom so the "next" link can be clicked.
                    js = "var q=document.documentElement.scrollTop=10000"
                    driver.execute_script(js)
                    driver.find_element_by_id('ctl00_phContenidos_lbSiguiente').click()
                    # Wait before reading the page so the HTML captured below
                    # belongs to the new page rather than the previous one.
                    time.sleep(3)
                except Exception as exc:
                    # Without a working "next" link there is nothing more to
                    # fetch; stop instead of looping over empty pages.
                    print("stopping at page " + str(page) + ": " + repr(exc))
                    break

            result = driver.page_source

            soup = BeautifulSoup(result, 'html.parser')
            result_div = soup.find_all('figure', attrs={"class": "Lotes fade"})
            for lot in result_div:
                # `replace` is a helper defined outside this block; presumably it
                # turns the tag into a normalised HTML string -- TODO confirm.
                result_replace = replace(lot)
                print(result_replace)

                item_url, item_lotnum, item_imgurl, sold_price = _parse_lot(result_replace)

                print(item_url)
                print(item_lotnum)
                print(item_imgurl)
                print(sold_price)
    finally:
        # Always shut the browser down, even if the crawl raised.
        driver.quit()

　　



   



# Start the crawl as soon as this file is executed (there is no __main__ guard,
# so importing the file triggers it as well).
spider()

posted @ 2018-02-11 14:43 淋哥 · 阅读(2320) · 评论(0) · 编辑 · 收藏 · 举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 10年+ .NET Coder 心语 ── 封装的思维：从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 周边上新：园子的第一款马克杯温暖上架
· Open-Sora 2.0 重磅开源！
· 提示词工程——AI应用必不可少的技术

公告

昵称：淋哥
园龄： 8年10个月
粉丝： 229
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

英雄莫问出处,富贵当思缘由

Python 爬虫实例（12）—— python selenium 爬虫

公告

搜索

常用链接

最新随笔

我的标签

积分与排名

随笔分类 (338)

随笔档案 (452)

文章分类 (6)

文章档案 (19)

阅读排行榜

评论排行榜

推荐排行榜

最新评论