贴吧爬取

# coding=utf-8
import requests
import re
from requests_html import HTMLSession
import pandas as pd
import time

session = HTMLSession()  # requests_html session, used later to parse reply HTML

# Mobile User-Agent plus a logged-in cookie so Tieba serves the mobile/ajax view.
# NOTE(review): the Cookie below embeds personal session credentials (BDUSS,
# STOKEN, etc.) — these should not be committed to source control.
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'Cookie': 'BAIDUID=0AD95F29B28B1C69CF12212918D35FC5:FG=1; BDUSS=xRTTRqU2poYXJxZmx5bTF0dm5iVERtdWRnTC1hbDJIbnltcGlOcmtuejk1VDViQVFBQUFBJCQAAAAAAAAAAAEAAAC4ED841cW4o8H6MjAxM8zs0KsAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP1YF1v9WBdbV0; BIDUPSID=0AD95F29B28B1C69CF12212918D35FC5; PSTM=1528257025; TIEBAUID=eaa5821fe8cd6332e9f74ebe; TIEBA_USERTYPE=4fe0d47f0a8a56b9153531e1; bdshare_firstime=1529484152117; STOKEN=fb86f516529f2e700875d976398014ccffa45fc25536938272acb3cef065221a; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; UM_distinctid=1651dfa3911746-0d266b95e90a93-163f6952-13c680-1651dfa391242f; Hm_lvt_addc40d255fca71b9b06a07c2397b42a=1533006153,1533094604,1533611406,1533637141; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=1421_21080_26921_20927; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1533711844,1533711977,1533781285,1533868202; 943657144_FRSVideoUploadTip=1; mo_originid=2; IS_NEW_USER=121622ee0999d777aa2e3fa8; BAIDU_WISE_UID=wapp_1533868860558_698; CLIENTWIDTH=375; CLIENTHEIGHT=667; LASW=375; fixedbarautopop=1; recommend_item_click=0; wise_device=1; pb_prompt=1; SET_PB_IMAGE_WIDTH=355; SEENKW=%E6%89%AB%E7%A0%81%23%C9%A8%C2%EB; CNZZDATA1272960286=201730737-1529483780-null%7C1533869631; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1533870061'
}


# Paginated ajax listing URL template for the forum (kw=%E6%89%AB%E7%A0%81 is
# the URL-encoded forum name "扫码").
# NOTE(review): 'pn' appears twice in the query string (a fixed pn=0 and the
# pn={} placeholder) — presumably the server honours the last occurrence,
# but this should be verified.
url_first = 'https://tieba.baidu.com/mo/q/m?kw=%E6%89%AB%E7%A0%81&pn=0&lp=5024&forum_recommend=1&lm=0&cid=0&has_url_param=0&pn={}&is_ajax=1'
# Page offsets 50, 100, ..., 3550 — 71 listing pages, 50 posts per page.
all_first_urls = [url_first.format(50*i) for i in range(1,72)]

# Accumulator: absolute post URLs harvested from the listing pages.
all_fina_url = []

def gen_all_urls(url):
    """Fetch one forum listing page and collect the post URLs it contains.

    Parameters
    ----------
    url : str
        One paginated listing URL from ``all_first_urls``.

    Side effects
    ------------
    Extends the module-level ``all_fina_url`` list with the absolute URL of
    every post link found on the page. Network errors and malformed JSON
    propagate to the caller.
    """
    res = requests.get(url, headers=headers)
    # The ajax endpoint returns JSON whose data.content field is an HTML fragment.
    content = res.json()['data']['content']

    # Raw string for the regex (the original used a plain string and relied on
    # '\d' not being a recognised Python escape); pattern text is unchanged.
    paths = re.findall(r'href="(/p/\d+\?lp=5027&mo_device=1&is_jingpost=0)"', content)

    all_fina_url.extend('https://tieba.baidu.com' + path for path in paths)


all_fina_data = []


def get_single(url):
    """Fetch one post page and record the text of each reply on it.

    Parameters
    ----------
    url : str
        Absolute URL of a single post, as produced by ``gen_all_urls``.

    Side effects
    ------------
    Appends one ``{'回复': text}`` dict per reply element to the module-level
    ``all_fina_data`` list. Network errors propagate to the caller.
    """
    r = session.get(url, headers=headers)

    # div[lz="0"] — presumably selects reply floors (lz flag 0 = not the
    # original poster); verify against the page markup.
    for reply in r.html.find('div[lz="0"]'):
        # BUG FIX: the original created a single dict *outside* the loop and
        # appended that same object on every iteration, so every stored entry
        # aliased one dict holding only the last reply's text. A fresh dict
        # per reply preserves each reply individually.
        all_fina_data.append({'回复': reply.text})


if __name__ == '__main__':
    # Stage 1: walk every listing page and gather the post URLs.
    for first_url in all_first_urls:
        gen_all_urls(first_url)

    # Stage 2: visit each post and collect its replies.
    for fina_url in all_fina_url:
        get_single(fina_url)

    # Stage 3: dump everything to a timestamped Excel workbook.
    out_name = '扫码贴吧信息' + time.strftime("%Y%m%d%H%M") + '.xlsx'
    pd.DataFrame(all_fina_data).to_excel(out_name, index=False)
    print('done')

 

posted @ 2018-08-17 10:18  Erick-LONG  阅读(199)  评论(0编辑  收藏  举报