寻医问药 (xywy.com) crawler
import re

import pandas as pd
import requests


def get_all_date_url():
    """Collect the date-index URLs linked from the 61 department listing pages."""
    all_url = []
    for i in range(61):
        url = 'http://club.xywy.com/keshi/{}.html'.format(i + 1)
        res = requests.get(url)
        urls = re.findall(r"http://club.xywy.com/keshi/\d{4}-\d{2}-\d+/\d+\.html", res.text)
        all_url.extend(urls)
    return list(set(all_url))


def get_QA_url(url):
    """Expand one date-index URL into the URLs of all of its paginated pages."""
    all_QA_url = []
    res = requests.get(url)
    res.encoding = 'gb2312'
    all_page = re.findall(r'共 (\d+) 页', res.text)[0]
    for i in range(int(all_page)):
        url1 = 'http://club.xywy.com/keshi/' + url.split('/')[-2] + '/' + str(i + 1) + '.html'
        all_QA_url.append(url1)
    return list(set(all_QA_url))


def main():
    # Gather every paginated Q&A URL across all departments and dates.
    all_url_data = []
    for i in get_all_date_url():
        all_url_data.extend(get_QA_url(i))

    info_list = []
    for detail_url in all_url_data:
        # TODO: fetch detail_url and parse the fields below; they are
        # placeholders until page-specific parsing is implemented.
        final_dic_data = {}
        final_dic_data['url'] = detail_url
        final_dic_data['患者标题'] = None  # question title
        final_dic_data['患者姓名'] = None  # patient name
        final_dic_data['患者性别'] = None  # patient gender
        final_dic_data['提问日期'] = None  # date asked
        final_dic_data['患者描述'] = None  # patient's description
        final_dic_data['医生姓名'] = None  # doctor name
        final_dic_data['医生职称'] = None  # doctor title
        final_dic_data['医生科室'] = None  # doctor department
        final_dic_data['问题分析'] = None  # doctor's analysis
        final_dic_data['回答时间'] = None  # answer time
        info_list.append(final_dic_data)

    df = pd.DataFrame(info_list)
    df.to_excel('xunyiwenyao.xlsx', index=False)


if __name__ == '__main__':
    main()
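The per-question fields are left as placeholders because extracting them depends on the Q&A page markup. Below is a minimal sketch of one way to fill them in, assuming each field can be matched with a simple regular expression; the FIELD_PATTERNS mapping and the parse_detail helper are illustrative assumptions, not verified against xywy.com's actual HTML, and the patterns would need to be adjusted against a real page.

import re
import requests

# Hypothetical field -> regex mapping; the patterns are assumptions about the
# page markup and must be checked against a real Q&A page before use.
FIELD_PATTERNS = {
    '患者标题': r'<title>(.*?)</title>',
    '患者性别': r'性别[:：]\s*([男女])',
    '提问日期': r'(\d{4}-\d{2}-\d{2})',
}

def parse_detail(detail_url):
    """Fetch one Q&A page and extract whatever fields the patterns can find."""
    res = requests.get(detail_url, timeout=10)
    res.encoding = 'gb2312'  # same encoding assumption as the listing pages
    data = {'url': detail_url}
    for field, pattern in FIELD_PATTERNS.items():
        match = re.search(pattern, res.text, re.S)
        data[field] = match.group(1).strip() if match else None
    return data

With such a helper in place, main() could call parse_detail(detail_url) instead of building the placeholder dict, and it is worth pausing briefly between requests (e.g. with time.sleep) so the crawl does not hammer the site.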