爬取途虎车主文档标题链接并写入excel
# -*- coding: utf-8 -*-
# __title__ = '爬取途虎车主文档标题链接并写入excel.py'
# __author__ = 'yangyang'
# __mtime__ = '2018.03.22'
"""Scrape article titles and links from the Tuhu community and save them to Excel.

Listing pages are fetched from
https://www.tuhu.cn/Community/Discovery.aspx?tagId=1&pageIndex=<n>
for n in 1..100, and the collected ``{title: link}`` mapping is written to
'车主问答.xls' in the current working directory.

Regex notes: the ``r`` prefix makes a raw string literal; ``re.S`` lets ``.``
match newlines as well.
"""
# NOTE: time, xlrd and ProcessPoolExecutor are not used by the code below; the
# imports are kept because other parts of the project may rely on them.
import os
import re
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import requests
import xlrd
import xlwt

# Seconds to wait on the server before giving up on a page; without a timeout
# a single stalled connection would block a worker thread forever.
REQUEST_TIMEOUT = 10

# One article entry on a listing page: everything between the title <div> and
# its closing </a></div>.
_TITLE_BLOCK_RE = re.compile(r'<div class="title">(.*?)</a></div>', re.S)
# Relative link inside an entry: text after a '/' up to a 4-letter extension.
_LINK_RE = re.compile(r'/(.*?\.[a-z]{4})', re.S)
# A single CJK character; the title is rebuilt from these matches only.
_CJK_RE = re.compile(r'[\u4e00-\u9fa5]', re.S)

# Filled by parse_page() as each page download finishes: {title: url}.
content_dic = {}


def _extract_articles(html):
    """Return a ``{title: absolute_url}`` dict parsed from listing-page HTML."""
    articles = {}
    for block in _TITLE_BLOCK_RE.findall(html):
        links = _LINK_RE.findall(block)
        if not links:
            # An entry without a recognisable link cannot be recorded; skip it
            # instead of failing the whole page with an IndexError.
            continue
        # Only CJK characters are kept, so digits, Latin letters and
        # punctuation are dropped from the title.
        title = ''.join(_CJK_RE.findall(block))
        articles[title] = 'https://www.tuhu.cn/%s' % links[0]
    return articles


# Fetch a page
def get_page(url):
    """Download one listing page and return its articles.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        dict mapping article title to absolute article URL. Entries that
        produce the same title overwrite each other; the last one wins.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    print('<%s> is getting [%s]' % (os.getpid(), url))
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return _extract_articles(response.text)


# Parse a page
def parse_page(res):
    """Done-callback: merge the articles from a finished future into content_dic.

    Args:
        res: A finished future whose result is the dict returned by get_page.
    """
    try:
        page_articles = res.result()
    except requests.RequestException as exc:
        # One unreachable page should not cost the rest of the crawl.
        print('failed to fetch a page: %s' % exc)
        return
    content_dic.update(page_articles)


def write_excel(res):
    """Write a ``{title: url}`` mapping to '车主问答.xls'.

    Row 0 holds the two column headers; each following row holds one
    title/link pair.

    Args:
        res: dict mapping article title to article URL.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'车主问答', cell_overwrite_ok=True)
    for col, header in enumerate([u'标题', u'链接']):
        sheet.write(0, col, header)
    # Data rows start at 1 because row 0 is the header.
    for row, title in enumerate(res, 1):
        sheet.write(row, 0, title)  # sheet.write(row, column, value)
        sheet.write(row, 1, res[title])
    workbook.save('车主问答.xls')


def main():
    """Fetch listing pages 1-100 concurrently and write the results to Excel."""
    urls = [
        'https://www.tuhu.cn/Community/Discovery.aspx?tagId=1&pageIndex=%s' % page
        for page in range(1, 101)
    ]
    # Leaving the with-block waits for every download (and its callback) to
    # finish, so content_dic is complete before it is written out.
    with ThreadPoolExecutor() as pool:
        for page_url in urls:
            pool.submit(get_page, page_url).add_done_callback(parse_page)
    write_excel(content_dic)


if __name__ == '__main__':
    main()