lxml and bs4 select, plus a 58.com (58同城) crawler example

  • Basic version

    #coding=utf8
    from __future__ import unicode_literals
    from bs4 import BeautifulSoup
    import requests
    
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Cookie':''
    } 
    url='http://www.tripadvisor.cn/Hotels-g60763-New_York_City_New_York-Hotels.html'
    
    wb_data=requests.get(url,headers=headers)
    soup=BeautifulSoup(wb_data.content,'lxml')
    titles=soup.select('a.property_title')
    imgs=soup.select('img[height="200"]')  # select all images whose height attribute is 200; a class selector also works, e.g. imgs=soup.select('img.photo_image')
    cates=soup.select('div.clickable_tags')
    for title,img,cate in zip(titles,imgs,cates):
        data={
            'title':title.get_text(),   # title text
            'img':img.get('src'),       # image URL from the src attribute
            'cate':cate.get_text(',')   # tag text, joined with ","
        }
        print(data)
    
  • Refactored into functions:

    from bs4 import BeautifulSoup
    import requests
    
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Cookie':''
    }
    url_saves='http://www.tripadvisor.cn/Hotels-g60763-New_York_City_New_York-Hotels.html'
    
    def get_attractions(url,data=None):
        wb_data=requests.get(url,headers=headers)
        soup=BeautifulSoup(wb_data.text,'lxml')
        titles=soup.select('a.property_title')
        imgs=soup.select('img.photo_image')
        mates=soup.select('div.clickable_tags')
    
        for title,img,mate in zip(titles,imgs,mates):
            data={
                'title':title.get_text(),
                'img':img.get('src'),
                'mate':mate.get_text()
            }
            print(data)
    
    def get_favs(url,data=None):
        wb_data=requests.get(url,headers=headers)
        soup=BeautifulSoup(wb_data.text,'lxml')
        titles=soup.select('a.property_title')
        imgs = soup.select('img.photo_image')
        mates = soup.select('div.clickable_tags')
    
        if data is None:
            for title, img, mate in zip(titles, imgs, mates):
                data = {
                    'title': title.get_text(),
                    'img': img.get('src'),
                    'mate': mate.get_text()
                }
                print(data)
    
    
    
    get_favs(url_saves)
    
  • For pages whose pagination is controlled by the URL, all pages can be collected like this:
    urls=['http://tieba.baidu.com/p/4900705159?pn={}'.format(str(i)) for i in range(1,6,1)]

    for single_url in urls:
        get_attractions(single_url)  # loop over the URLs and call get_attractions() defined above to fetch every page
    
  • For images that are rendered with JavaScript, you can set the User-Agent to impersonate a mobile client and then fetch the images (see the sketch below).
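
    A minimal sketch of that idea, assuming the target page serves plain <img> tags to mobile browsers (the mobile User-Agent string below is only an example value):

    from bs4 import BeautifulSoup
    import requests

    # impersonate a mobile browser so the server returns the simpler mobile page
    mobile_headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E277 Safari/602.1'
    }
    url = 'http://www.tripadvisor.cn/Hotels-g60763-New_York_City_New_York-Hotels.html'
    wb_data = requests.get(url, headers=mobile_headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    imgs = [img.get('src') for img in soup.select('img')]  # src attributes now hold the real image URLs
    print(imgs)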

  • 58.com (58同城) example

	from bs4 import BeautifulSoup
	import requests
	import time
	url='http://bj.58.com/pingbandiannao/24604629984324x.shtml'  # a sample detail-page URL, used only for the initial test fetch below
	wb_data = requests.get(url)
	soup = BeautifulSoup(wb_data.text,'lxml')
	def get_links_from(who_sells):
	    urls = []
	    list_view = 'http://bj.58.com/pbdn/{}/pn2/'.format(str(who_sells))
	    wb_data = requests.get(list_view)
	    soup = BeautifulSoup(wb_data.text,'lxml')
	    for link in soup.select('td.t a.t'):  # the <a class="t"> links inside <td class="t"> cells hold the detail-page URLs
	        urls.append(link.get('href').split('?')[0])
	    return urls
	def get_views_from(url):
	    id = url.split('/')[-1].strip('x.shtml')  # last path segment; strip() removes the characters of 'x.shtml' from its ends, leaving the numeric listing id
	    api = 'http://jst1.58.com/counter?infoid={}'.format(id)  # the id is dynamic, so it is inserted into the counter API URL with .format()
	    # this is 58.com's view-counter API endpoint; if such interfaces are unfamiliar, the Sina Weibo API introduction is a useful reference
	    js = requests.get(api)
	    views = js.text.split('=')[-1]  # split the returned text on '=' and keep the last piece, which is the view count
	    return views
	def get_item_info(who_sells=0):
	    urls = get_links_from(who_sells)
	    for url in urls:
	        wb_data = requests.get(url)
	        soup = BeautifulSoup(wb_data.text,'lxml')
	        data = {
	            'title':soup.title.text,  # the page <title> text, taken directly
	            'price':soup.select('.price')[0].text,
	            # area: None when the seller did not fill it in; .stripped_strings removes extra whitespace and blank lines
	            'area' :list(soup.select('.c_25d')[0].stripped_strings) if soup.find_all('span','c_25d') else None,
	            'date' :soup.select('.time')[0].text,
	            'cate' :'个人' if who_sells == 0 else '商家',  # '个人' = individual seller, '商家' = merchant
	            # 'views':get_views_from(url)
	        }
	        print(data)
	get_links_from(1)    # who_sells=1: merchant listing URLs
	get_item_info()      # who_sells defaults to 0: individual sellers
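
	For reference, the view-counter helper can be tried on the sample URL above (note that the counter endpoint may have changed since this was written):

	print(get_views_from('http://bj.58.com/pingbandiannao/24604629984324x.shtml'))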
  • requests exposes the response status code, which can be used to check whether a page still exists (a fuller sketch follows the snippet):
wb_data=requests.get(url)
if wb_data.status_code==404:  # requests' status_code holds the HTTP status code; 404 means the page no longer exists
    pass
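
A fuller sketch of the same check inside a crawl loop, reusing get_links_from() from the 58 example above (a rough outline, not tested against the live site):

from bs4 import BeautifulSoup
import requests

for link in get_links_from(0):        # get_links_from() is defined in the 58 example above
    page = requests.get(link)
    if page.status_code == 404:       # the listing was removed or already sold
        continue                      # skip dead pages
    soup = BeautifulSoup(page.text, 'lxml')
    print(soup.title.text)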
  • Proxy pool
# http://cn-proxy.com/
import random

proxy_list = [
    'http://117.177.250.151:8081',
    'http://111.85.219.250:3129',
    'http://122.70.183.138:8118',
    ]
proxy_ip = random.choice(proxy_list) # pick a proxy IP at random
proxies = {'http': proxy_ip}
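
A minimal sketch of actually using the chosen proxy, continuing the snippet above (the public proxy IPs may no longer be alive, and the list URL is reused from the 58 example):

import random
import requests

proxy_ip = random.choice(proxy_list)   # proxy_list as defined above
proxies = {'http': proxy_ip}
# hand the proxies dict to requests; a timeout keeps a dead proxy from hanging the crawl
wb_data = requests.get('http://bj.58.com/pbdn/0/pn2/', proxies=proxies, timeout=10)
print(wb_data.status_code)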
  • Parsing the detail-page fields
        data = {
            'title':soup.title.text.strip(),
            'price':soup.select('.f22.fc-orange.f-type')[0].text.strip(),
            'pub_date':soup.select('.pr-5')[0].text.strip().split(' ')[0],

            # map() applies the lambda (x -> x.text) to every element that soup.select() returns, extracting the text;
            # map() gives back an iterator, so wrap it in list() to materialise the values
            'area':list(map(lambda x:x.text,soup.select('ul.det-infor > li:nth-of-type(3) > a'))),
            'cates':list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
            'url':url
        }
  • Deduplicating the URLs with sets (see the sketch after the snippet for where url_list and item_info come from):
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x-y
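
A minimal sketch of the surrounding setup, assuming url_list and item_info are pymongo collections (the database and collection names here are hypothetical):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['crawler_58']          # hypothetical database name
url_list = db['url_list']          # every listing URL collected from the index pages
item_info = db['item_info']        # detail records that have already been scraped

db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
rest_of_urls = set(db_urls) - set(index_urls)   # URLs whose detail pages still need crawling
print(len(rest_of_urls))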