爬虫之链家网

链家网 爬取
如何查看请求头:在浏览器开发者工具的 Network 面板中,点开状态码为 200 的请求,即可看到请求头(如 User-Agent)。

from lxml import etree
import requests
import csv
import time
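# Define the function that fetches and parses the listing data.
# A browser-like User-Agent header is sent so the server's anti-scraping checks do not block the requests.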
def _first_text(node, path):
    """Return the first result of ``node.xpath(path)``, or None when nothing matches."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def spider():
    """Crawl listing pages 1-10 under ``pre_url`` and save one row per house.

    For every ``<li>`` in the result list, the community name, layout, area,
    orientation and price are extracted and handed to ``data_write`` as a
    single row; a progress line is printed after each saved row.

    List items that lack any of the expected nodes, or whose info line has
    fewer than three `` | ``-separated fields, are skipped instead of
    aborting the crawl with an IndexError.

    Raises:
        requests.RequestException: if a page request fails or times out.
    """
    # Browser-like User-Agent so the requests are not rejected as a bot.
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    pre_url='https://cc.lianjia.com/ershoufang/pg'
    i=0  # number of rows saved so far, used only for the progress output
    for x in range(1,11):
        # A timeout keeps one stalled connection from hanging the whole run.
        html=requests.get(pre_url+str(x),headers=headers,timeout=30)
        html.encoding = 'utf-8'
        # Throttle: pause between page requests to avoid crawling too fast.
        time.sleep(5)
        selector=etree.HTML(html.text)
        house_list=selector.xpath('//*[@id="content"]/div[1]/ul/li')
        for house in house_list:
            apartment=_first_text(house,'div[1]/div[2]/div/a[1]/text()')
            house_more_info=_first_text(house,'div[1]/div[3]/div/text()')
            price_number=_first_text(house,'div[1]/div[6]/div[1]/span/text()')
            if apartment is None or house_more_info is None or price_number is None:
                # Not a regular listing entry (expected nodes missing); skip it.
                continue
            # Sample info line:
            # 3室1厅 | 163.85平米 | 西南 | 精装 | 高楼层(共32层) | 2003年建 | 塔楼
            house_more_info_split=house_more_info.split(' | ')
            if len(house_more_info_split)<3:
                continue
            house_layout=house_more_info_split[0]
            area=house_more_info_split[1]
            orientation=house_more_info_split[2]
            price=price_number+'万'
            item=[apartment,house_layout,area,orientation,price]
            data_write(item)
            # Listing title text nodes, shown only in the progress output.
            title=house.xpath('div[1]/div[1]/a/text()')
            i=i+1
            print(i,'正在抓取',title)
def data_write(item):
    """Append ``item`` as one row to ``ljian_ershoufang.csv``.

    The file is opened in append mode so rows from successive calls
    accumulate; opening with ``'w'`` would truncate the file on every call
    and leave only the last row. Rows from earlier runs are kept as well, so
    delete the file first when a fresh export is wanted.

    Args:
        item: sequence of field values forming a single CSV row.
    """
    # newline='' stops the csv module from emitting blank lines between rows.
    with open('ljian_ershoufang.csv','a',encoding='utf-8',newline='') as csvfile:
        writer=csv.writer(csvfile)
        writer.writerow(item)
# Start the crawl. This is a bare module-level call, so it runs whenever the
# file is loaded (on import as well as when executed directly).
spider()

posted @ 2020-01-28 10:48  Tony小哥  阅读(701)  评论(0编辑  收藏  举报