After the crawl finishes, a CSV spreadsheet file is generated automatically, containing the rent, deposit/payment terms, listing link, and other details.
Environment
Python 2.7
pip install lxml
pip install cssselect
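To double-check that lxml and cssselect are wired up correctly before running the full script, here is a minimal smoke-test sketch; the HTML string and selector below are made up for illustration only, but they exercise the same CSSSelector pattern the crawler relies on:

# coding:utf-8
# Quick smoke test for lxml + cssselect under Python 2.7.
# The HTML string and CSS selector here are illustrative only.
import lxml.html
from lxml.cssselect import CSSSelector

html = '<div class="des"><h2><a href="http://example.com/1">listing title</a></h2></div>'
tree = lxml.html.fromstring(html)

sel = CSSSelector('div.des > h2 > a')   # same selector style used by the crawler
for a in sel(tree):
    print(a.get('href'))                # http://example.com/1
    print(a.text_content())             # listing title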

#coding:utf-8
# Crawl 58.com rental listings for a given city and append the results to a CSV file.
import csv
import urllib2
import lxml.html
import time
import sys
from lxml.cssselect import CSSSelector
import threading

reload(sys)
sys.setdefaultencoding('utf8')

# Ask for the city abbreviation used in the 58.com subdomain, e.g. bj for Beijing.
print "请输入要爬取的城市简称,例如 bj(北京):"
CITY = str(raw_input(">>>"))


def download(url, user_agent='Google', num_retries=2):
    """Download a page, retrying up to num_retries times on 5xx server errors."""
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html


def get_data(url):
    """Parse one listing page and append title, price, deposit terms and details to the CSV."""
    html_text_detail = download(url)
    try:
        tree = lxml.html.fromstring(html_text_detail)
        house_title = CSSSelector('div.main-wrap > div.house-title > h1')
        house_pay_way1 = CSSSelector('div.house-pay-way > span:nth-child(1)')
        house_pay_way2 = CSSSelector('div.house-pay-way > span:nth-child(2)')
        print house_title(tree)[0].text_content()
        print '%s|%s' % (house_pay_way1(tree)[0].text_content(), house_pay_way2(tree)[0].text_content())

        # The detail section is a 7x2 grid of <li>/<span> cells; walk every cell.
        for i in range(7):
            for j in range(2):
                css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)' % (i + 1, j + 1)
                house_info = CSSSelector(css)
                data = [
                    ('标题 : ', house_title(tree)[0].text_content(), '#', url),
                    ('价格: ', house_pay_way1(tree)[0].text_content(), '#'),
                    ('押付: ', house_pay_way2(tree)[0].text_content(), '#'),
                    ('详情: ', house_info(tree)[0].text_content().replace(' ', ''), '#')]
                with open('%s_houses.csv' % CITY, 'ab+') as csvfile:
                    writer = csv.writer(csvfile, lineterminator='\n')
                    writer.writerows(data)

    except TypeError:
        # download() returned None, so there was nothing to parse.
        pass
    except IndexError:
        # The page layout did not match the selectors; skip this listing.
        pass


def get_url(html):
    """Extract the detail-page links from a listing index page, de-duplicated."""
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a')
    url_list = []
    for i in sel(tree):
        if i.get('href') not in url_list:
            url_list.append(i.get('href'))
    return url_list


if __name__ == '__main__':
    url_index = 'http://%s.58.com/chuzu/' % CITY
    html_text_list = download(url_index)
    url_list = get_url(html_text_list)

    # Fetch every detail page in its own thread, pausing briefly between thread starts.
    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()
        time.sleep(0.001)
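As a rough sketch of inspecting the generated spreadsheet afterwards (assuming the city entered was bj, so the output file is bj_houses.csv, following the file name pattern in the script above):

# coding:utf-8
# Read back the rows the crawler wrote (Python 2.7).
# Assumes the city abbreviation entered was "bj", so the file is bj_houses.csv.
import csv

with open('bj_houses.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Each row is a label plus its value: title/link, price, deposit terms, or one detail field.
        print(', '.join(row))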