去哪儿网北京当日酒店信息爬取

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

声明:仅学习参考

版本:version_0

说明:主要是通过selenium拿到网页源码,然后通过lxml进行解析;大部分时间都花在解析网页源码、提取数据以及编写逻辑上了

技术:selenium,lxml,json

   在xpath中如果要提取子节点的所有文本信息,可以用 "li.xpath('string(xpath_path)')"

效果图:

 

 

源码:

from selenium import webdriver
import time
import re
import json
from lxml import etree
from urllib import parse
from pprint import pprint


class QuNaErSpider():
    """Crawl the hotel list that qunar.com shows after its default hotel search.

    The page is rendered in a Chrome instance driven by selenium; each result
    page's HTML is parsed with lxml and appended to a local JSON-like file.
    """

    # XPath of the clickable "next page" control; no match means last page.
    _NEXT_PAGE_XPATH = '//p[@class="next fl_right cur able"]'

    def __init__(self):
        # Starts a Chrome session immediately; run() is responsible for quitting it.
        self.driver = webdriver.Chrome()

    def save_info(self, content):
        """Append ``content`` to ``qunaer_hotel_today_info.json`` in the working directory.

        Args:
            content: JSON-serialisable object (run() passes one page's item list).

        NOTE: every call appends the serialised object followed by a comma, so
        the file is a comma-separated sequence of JSON documents rather than a
        single valid JSON document. The format is kept as-is for readers that
        already consume it.
        """
        with open("qunaer_hotel_today_info.json", 'a+', encoding='utf-8') as f:
            f.write(json.dumps(obj=content, ensure_ascii=False, indent=4))
            f.write(",")
            print("写入完成")

    def parse_html(self, html_str, source_url):
        """Extract one dict per hotel from a result page.

        Args:
            html_str: HTML source of the result page.
            source_url: URL of that page, used to resolve relative hotel links.

        Returns:
            A list of dicts with the keys ``hotel_name``, ``totel_href``,
            ``total_type``, ``hotel_price``, ``hotel_address``,
            ``hotel_comment`` and ``hotel_subject``. A field that is not
            found (or is empty) is stored as ``None``.
        """
        html_etree = etree.HTML(text=html_str)
        li_list = html_etree.xpath('//ul[contains(@id,"hotel_lst_body")]/li')
        current_page_info_list = []
        for li in li_list:
            item = {}

            hotel_name = li.xpath('.//div[@class="cont"]/p[@class="name"]/a/@title')
            item["hotel_name"] = hotel_name[0] if hotel_name else None

            hotel_href = li.xpath('.//div[@class="cont"]/p[@class="name"]/a/@href')
            # NOTE: the key spelling ('totel_href') is kept for compatibility
            # with data that was already written.
            item['totel_href'] = (
                parse.urljoin(base=source_url, url=hotel_href[0]) if hotel_href else None
            )

            # The leading "." keeps the query relative to this <li>. A bare
            # "//" searches the whole document and would give every item the
            # type of the first hotel on the page.
            hotel_type = li.xpath('.//div[@class="cont"]/p[@class="name"]/span[last()]/text()')
            # NOTE: the key spelling ('total_type') is kept for compatibility.
            item['total_type'] = hotel_type[0] if hotel_type else None

            # string(...) concatenates the text of the node and all its descendants.
            hotel_price = li.xpath('string(.//p[@class="price_new"])')
            item['hotel_price'] = hotel_price or None

            hotel_address = li.xpath('.//div[@class="cont"]/p[@class="adress"]/text()')
            item["hotel_address"] = hotel_address[0] if hotel_address else None

            hotel_comment = li.xpath('string(.//div[@class="cont"]/p[@class="comm"])')
            item['hotel_comment'] = hotel_comment or None

            hotel_subject = li.xpath('string(.//div[@class="cont"]/div[@class="subj rmb"])')
            item['hotel_subject'] = hotel_subject or None

            current_page_info_list.append(item)
        return current_page_info_list

    def into_first_page(self, driver, url=None):
        """Open ``url``, click the third header navigation entry, then the search button.

        Args:
            driver: selenium WebDriver to operate on.
            url: address to load first (run() passes the qunar.com home page).

        Returns:
            The same ``driver``, after the clicks and a one-second pause.
        """
        driver.get(url)
        # find_element("xpath", ...) is used instead of find_element_by_xpath,
        # which newer selenium releases no longer provide.
        hotel_element = driver.find_element(
            "xpath", '//div[contains(@class,"q_header_mnav")]/ul/li[3]'
        )
        hotel_element.click()
        search_button = driver.find_element(
            "xpath", '//div[@class="G_searchIndex fl_left"]//div[@class="btn clearfix"]'
        )
        search_button.click()
        # Fixed pause before the caller reads page_source.
        time.sleep(1)
        return driver

    def _find_next_button(self, driver):
        """Return the "next page" element, or ``None`` when the page has none."""
        from selenium.common.exceptions import NoSuchElementException

        try:
            return driver.find_element("xpath", self._NEXT_PAGE_XPATH)
        except NoSuchElementException:
            return None

    def run(self):
        """Crawl every result page, saving each page's items, then quit the browser.

        The browser is closed even when navigation, parsing or saving raises.
        """
        root_url = "https://www.qunar.com/"
        try:
            driver = self.into_first_page(driver=self.driver, url=root_url)
            while True:
                current_page_info_list = self.parse_html(
                    html_str=driver.page_source, source_url=driver.current_url
                )
                self.save_info(current_page_info_list)

                nextpage_button = self._find_next_button(driver)
                if nextpage_button is None:
                    # No clickable "next" control: this was the last page.
                    break
                nextpage_button.click()
                # Fixed pause before reading the next page's source.
                time.sleep(1)
        finally:
            self.driver.quit()


if __name__ == "__main__":
    # Script entry point: constructing the spider opens Chrome, run() crawls
    # the result pages.
    spider = QuNaErSpider()
    spider.run()

 

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

posted @ 2020-06-14 19:46  Norni  阅读(333)  评论(0)  编辑  收藏  举报