Fetch China's more highly weighted (authoritative) websites from the Chinaz top-site rankings.

# encoding=utf-8
import re
import requests
from bs4 import BeautifulSoup

class getUrl(object):

    def __init__(self, num):
        self.total = num
        # request headers sent with every page fetch
        self.myheader = {
            'Host': 'top.chinaz.com',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
            'Accept': '*/*',
            'Referer': 'http://www.chinaz.com/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    def beginer(self):
        print('get start')
        page = 2
        url_list = []
        while page < 1680:
            url = 'http://top.chinaz.com/all/index_' + str(page) + '.html'
            r = requests.get(url, headers=self.myheader)
            soup = BeautifulSoup(r.text, 'html.parser')
            # grab every element with class 'col-gray' and pull the span text out of it
            spans = soup.select('.col-gray')
            sites = re.findall('<span.*?>(.*?)</span>', str(spans))
            del sites[0]  # discard the first matched span
            for elem in sites:
                url_list.append(elem)
            page += 1
        self.writeQQ(text=url_list, file_dir='site.text', mode='w')

    def writeQQ(self, text, file_dir, mode):
        # write one collected domain per line
        with open(file_dir, mode) as f:
            for site in text:
                f.write(site)
                f.write("\n")




spider = getUrl(44)
spider.beginer()
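
As a quick sanity check after a run, the short sketch below reads the saved list back and prints the first few entries; it assumes the script above has finished and written site.text into the working directory.

# encoding=utf-8
# Inspect the scraper's output (assumes site.text was written by the run above).
with open('site.text') as f:
    domains = [line.strip() for line in f if line.strip()]

print('collected %d domains' % len(domains))
for d in domains[:10]:  # show the first ten entries
    print(d)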