房天下爬虫
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape second-hand housing listings from esf.sh.fang.com into house.xlsx.

The index page is fetched once, every listing linked from it is fetched and
parsed by ``gethousedetail``, and the collected rows are written to an Excel
workbook with pandas.
"""
import requests
from bs4 import BeautifulSoup
import pandas

# Seconds to wait on the server. requests applies no timeout by default, so
# without this a single stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10


def _first_text(soup, selector):
    """Return the stripped text of the first node matching selector, or None."""
    matches = soup.select(selector)
    return matches[0].text.strip() if matches else None


def gethousedetail(url, timeout=REQUEST_TIMEOUT):
    """Fetch one listing page and return its attributes as a dict.

    Args:
        url: Absolute URL of the listing detail page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys 'title' and 'price' (None when the page has no
        matching node) plus one entry for every <dd> element whose text
        contains a colon: the text before the first colon is the key and
        the text after it is the value.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    res.raise_for_status()
    # NOTE(review): res.text is decoded with the charset requests infers from
    # the response headers; if scraped text comes out garbled, set
    # res.encoding explicitly -- confirm against the live site.
    soup = BeautifulSoup(res.text, 'html.parser')
    info = {
        'title': _first_text(soup, 'title'),
        'price': _first_text(soup, '.zongjia1 .red20b'),
    }
    for dd in soup.select('dd'):
        # Split on the FIRST colon only: values may themselves contain
        # colons, and unpacking a plain split(':') would then raise
        # ValueError and abort the whole crawl.
        k, sep, v = dd.text.strip().partition(':')
        if sep:
            info[k] = v
    return info


def main():
    """Crawl the listing index, scrape each listing, and write house.xlsx."""
    domain = 'http://esf.sh.fang.com'
    res = requests.get(domain + '/', timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    houseary = []
    for house in soup.select('.houseList dl'):
        links = house.select('.title a')
        href = links[0].get('href') if links else None
        if not href:
            # An entry without a title link cannot be followed; skip it
            # rather than crash with IndexError/KeyError.
            continue
        # Assumes hrefs are site-relative paths -- TODO confirm.
        houseary.append(gethousedetail(domain + href))

    df = pandas.DataFrame(houseary)
    df.to_excel('house.xlsx', index=False)


if __name__ == '__main__':
    main()