# Scrape Tuhu car-owner article titles and links and write them to Excel (爬取途虎车主文档标题链接并写入excel)

# -*- coding: utf-8 -*-

# __title__ = '爬取途虎车主文档标题链接并写入excel.py'
# __author__ = 'yangyang'
# __mtime__ = '2018.03.22'

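'''
Scrape article titles and links from the Tuhu car-owner community pages
and write them to an Excel (.xls) workbook.
Listing pages: https://www.tuhu.cn/Community/Discovery.aspx?tagId=1&pageIndex=N
r:    raw-string prefix, so backslashes in a regex are not treated as string escapes
re.S: makes "." also match newline characters
'''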
import requests,re
import time,os
import xlrd,xlwt
from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor

# 获取网页,
# Patterns are compiled once at import time rather than on every loop pass.
_ARTICLE_BLOCK_RE = re.compile(r'<div class="title">(.*?)</a></div>', re.S)
_ARTICLE_LINK_RE = re.compile(r'/(.*?\.[a-z]{4})', re.S)    # relative link ending in a 4-letter extension
_CJK_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]', re.S)         # single characters in U+4E00..U+9FA5


def get_page(url, timeout=10):
    """Download one listing page and extract its article titles and links.

    Args:
        url: Address of the listing page to fetch.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
            stalled connection cannot block a worker indefinitely.

    Returns:
        dict mapping title -> absolute article URL.  A title is built only
        from the characters in the block that fall in U+4E00..U+9FA5 (all
        other characters are dropped), so two articles whose titles reduce
        to the same text share one key and the later one wins.

    Raises:
        Whatever ``requests.get`` raises (connection failure, timeout, ...).
    """
    print('<%s> is getting [%s]' % (os.getpid(), url))
    response = requests.get(url, timeout=timeout)

    article_dict = {}
    for content in _ARTICLE_BLOCK_RE.findall(response.text):
        link_match = _ARTICLE_LINK_RE.search(content)
        if link_match is None:
            # A title block without a recognisable link used to raise
            # IndexError and discard the whole page; skip just that block.
            continue
        title = ''.join(_CJK_CHAR_RE.findall(content))
        article_dict[title] = 'https://www.tuhu.cn/%s' % link_match.group(1)
    return article_dict
# # 解析网页



def parse_page(res):
    """Done-callback that merges one finished page into ``content_dic``.

    Args:
        res: A future-like object; its ``result()`` must return a mapping,
            which is merged into the module-level ``content_dic`` dict.
            If ``result()`` raises, the exception propagates out of this
            function and nothing is merged.
    """
    # The shared dict is only mutated, never rebound, so no ``global`` is needed.
    page_articles = res.result()
    content_dic.update(page_articles)

def write_excel(res, filename='车主问答.xls'):
    """Write title/link pairs to a single-sheet .xls workbook.

    Args:
        res: Mapping of title -> link.  Each item becomes one row, in the
            mapping's iteration order, below the header row.
        filename: Path the workbook is saved to.  Defaults to the path that
            was previously hard-coded, so existing callers are unaffected.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'车主问答', cell_overwrite_ok=True)

    # Row 0 holds the column headings.
    for col, heading in enumerate([u'标题', u'链接']):
        sheet.write(0, col, heading)            # sheet.write(row, column, value)

    # Data rows start at 1, directly below the header.
    for row, (title, link) in enumerate(res.items(), 1):
        sheet.write(row, 0, title)
        sheet.write(row, 1, link)

    workbook.save(filename)



if __name__ == '__main__':
    # Module-level accumulator that parse_page() fills from the done-callbacks.
    content_dic = {}
    # Listing pages 1..100 of the tagId=1 community section.
    urls = [
        'https://www.tuhu.cn/Community/Discovery.aspx?tagId=1&pageIndex=%s' % page
        for page in range(1, 101)
    ]

    # The with-block waits for all submitted work and shuts the pool down
    # even if submitting a task raises part-way through.
    with ThreadPoolExecutor() as pool:
        for page_url in urls:
            pool.submit(get_page, page_url).add_done_callback(parse_page)

    # All workers have finished here, so the accumulator is complete.
    write_excel(content_dic)

 

# posted @ 2018-03-22 17:22  Nice_keep-going  阅读(196)  评论(0)  编辑  收藏  举报