# encoding=utf-8
import re
import requests

class getUrl(object):
    """Collect the link targets found on wooyun.org's ``corps`` listing pages.

    The crawler fetches ``http://www.wooyun.org/corps/page/<n>`` for each
    page number, extracts every ``href`` that points at an ``http://`` or
    ``https://`` URL (scheme stripped), and writes the results to
    ``site.text``, one per line.
    """

    def __init__(self, num):
        """Store the page count and the request headers.

        Args:
            num: number of the last listing page to fetch; pages are
                numbered starting at 1.
        """
        self.totle = num
        # Request header information sent with every page fetch.
        # The Connection value must not carry leading whitespace: recent
        # versions of requests reject such header values as invalid.
        self.myheader = {
            'Host': 'www.wooyun.org',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
            'Accept': '*/*',
            'Referer': 'http://www.wooyun.com/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    def beginer(self):
        """Fetch pages 1..``self.totle`` and write the found links to ``site.text``.

        Any exception raised by ``requests.get`` or by the file write
        propagates to the caller; nothing is written unless every page was
        fetched.
        """
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('get start')
        urlliset = []
        # Honour the page count given to the constructor instead of a
        # hard-coded bound.
        for page in range(1, self.totle + 1):
            url = 'http://www.wooyun.org/corps/page/' + str(page)
            r = requests.get(url, headers=self.myheader)
            urlliset.extend(self._extract_sites(r.text))
        self.writeQQ(text=urlliset, file_dir='site.text', mode='w')

    @staticmethod
    def _extract_sites(html):
        """Return the scheme-less targets of all http/https hrefs in *html*.

        ``http://`` links come first, followed by ``https://`` links, each
        group in document order.
        """
        http_sites = re.findall('href="http://(.*?)"', html)
        https_sites = re.findall('href="https://(.*?)"', html)
        return http_sites + https_sites

    def writeQQ(self, text, file_dir, mode):
        """Write each string in *text* to *file_dir*, one per line.

        Args:
            text: iterable of strings to write.
            file_dir: path of the output file.
            mode: mode string passed to ``open`` (e.g. ``'w'`` or ``'a'``).
        """
        with open(file_dir, mode) as f:
            f.writelines(site + "\n" for site in text)




# Run the crawl only when this file is executed as a script, so that
# importing the module (e.g. to reuse getUrl) does not start network requests.
if __name__ == '__main__':
    spidre = getUrl(44)
    spidre.beginer()