# Python 获取页面文字信息

# -*- coding: utf-8 -*-

from selenium import webdriver
import time, re,requests,os,time,random,traceback
import urllib.request,threading
from bs4 import BeautifulSoup
import html.parser
from tkinter import *
from tkinter import ttk
import tkinter.messagebox 


def getHtml(questionId,page):
    """Scrape one page of answers for a Zhihu question and save them to disk.

    Loads ``https://www.zhihu.com/question/<questionId>/answers/updated?page=<page>``
    in headless Chrome, scrolls down in steps, then writes the text of every
    ``RichContent-inner`` element longer than 300 characters to
    ``<questionId>/page_<page>.txt`` (UTF-8), each followed by a separator line.

    Args:
        questionId: Question identifier as a string; it is concatenated into
            the URL and also used as the output directory name.
        page: Page number; converted with ``str()`` for the URL and file name.

    Returns:
        The ``BeautifulSoup`` tree parsed from the page source captured after
        scrolling.

    Raises:
        Any exception raised by Selenium or by file I/O propagates to the
        caller; the browser is shut down first in every case.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--start-maximized')  # maximized window; the original author notes element lookup fails without it
    chrome_options.add_argument('--disable-infobars')  # hide the "controlled by automated software" banner
    chrome_options.add_argument('--incognito')  # incognito (private) mode
    chrome_options.add_argument('--headless')  # run without a visible browser window

    # NOTE(review): `executable_path=` and `find_elements_by_class_name` are
    # Selenium 3-era APIs that later Selenium 4 releases removed -- confirm the
    # installed Selenium version before upgrading.
    driver = webdriver.Chrome(executable_path = "chromedriver",options=chrome_options)
    try:
        driver.get("https://www.zhihu.com/question/"+questionId+"/answers/updated?page="+str(page))

        # Simulate a user scrolling down the page step by step (presumably so
        # that lazily loaded answers get rendered -- TODO confirm).
        for step in range(12):
            print(str(step)+'次点击')
            driver.execute_script("window.scrollTo(0, "+str(1000 * step)+");")
            time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Parse the raw HTML of the page as it stands after scrolling.
        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Collect the answer bodies while the browser session is still alive.
        parts = ["start\n"]
        for answer in driver.find_elements_by_class_name("RichContent-inner"):
            text = answer.text  # one WebDriver round-trip per answer
            if len(text) > 300:
                parts.append(text + "\n-----------我是分隔符------\n")
    finally:
        # Always release the browser, even when loading or scraping fails;
        # otherwise every failed page leaves a headless Chrome process behind.
        driver.quit()

    # Make sure the output directory exists so the function also works when
    # called on its own.
    os.makedirs(questionId, exist_ok=True)
    with open(questionId +"/page_"+str(page)+".txt", 'w',encoding="utf-8") as zhpage:
        zhpage.write("".join(parts))
    print("爬取回答页面成功!!!")
    return result_soup

def readTxt(path):
    """Return the full contents of the UTF-8 text file at *path*.

    Args:
        path: Path of the file to read.

    Returns:
        The file's text as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(path,'r',encoding='utf-8') as f:
        return f.read()
        

def main(questionId,startPage,endPage):
    """Scrape a range of answer pages for one question.

    Creates the output directory named after *questionId*, then calls
    ``getHtml`` for every page number in ``range(startPage, endPage)``
    (so *endPage* itself is not fetched), pausing 5-7 seconds after each
    successful page. A failure on one page prints its traceback and the
    loop moves on to the next page.

    Args:
        questionId: Question identifier string, also the output directory name.
        startPage: First page number to fetch.
        endPage: Page number at which to stop (exclusive).
    """
    mkdir([questionId])
    pause_choices = (5, 6, 7)
    for page_no in range(startPage, endPage):
        try:
            getHtml(questionId, page_no)
            # Random delay between pages.
            time.sleep(random.choice(pause_choices))
        except Exception:
            # Report the failure and continue with the remaining pages.
            traceback.print_exc()

def mkdir(paths):
    """Create each directory in *paths* that does not already exist.

    Args:
        paths: Iterable of directory paths. Only the last path component is
            created; parent directories must already exist.

    Raises:
        OSError: If a directory cannot be created for a reason other than
            the path already existing (for example a missing parent).
    """
    for path in paths:
        # Try to create and tolerate "already there" instead of checking
        # first: a check-then-create sequence can fail if the path appears
        # between the two steps.
        try:
            os.mkdir(path)
        except FileExistsError:
            pass

def getanswer():
    """Button callback: read the form fields and start scraping.

    Reads the question id, start page and end page from the module-level Tk
    variables ``var_id``, ``var_start`` and ``var_end`` and passes them to
    ``main``.

    Raises:
        ValueError: If the start or end page field does not contain an integer.
    """
    # Text typed into an Entry comes back as a string, but main() feeds the
    # page numbers to range() and concatenates the id into a URL, so
    # normalize the types here.
    questionId = str(var_id.get()).strip()
    start = int(var_start.get())
    end = int(var_end.get())
    main(questionId,start,end)

# Script entry point: scrape question 308829198. main() iterates
# range(101, 200), i.e. pages 101 through 199.
if __name__ == '__main__':
    main(str(308829198),101,200)


# Build the Tk form for entering a question id and a page range.
# NOTE(review): these statements are outside the __main__ guard, so they run
# on import as well, and when run as a script they execute only after the
# main(...) call above has returned.
tk = Tk()
tk.title('获取知乎问题所有答案')
tk.geometry('600x150')

# NOTE(review): `frame` is created but not used anywhere in the visible code.
frame = Frame(tk)
# Question id label and entry; getanswer() reads the value through var_id.
Label(tk,text='问题标识:(例:https://www.zhihu.com/question/324405640/answer/720532471中的324405640 )',width=200,anchor=W, justify=LEFT).place(x=10,y=10)
var_id = Variable()
question_id = Entry(tk,textvariable=var_id,width=30)
question_id.place(x=10,y=40)

# Start page label and entry (initial value 1); read through var_start.
Label(tk,text='开始页:').place(x=230,y=40)
var_start = Variable()
# NOTE(review): `e` is bound to the result of .place(), not to the Entry widget.
e = Entry(tk, textvariable=var_start,width=10).place(x=290,y=40)
var_start.set(1)


# End page label and entry (initial value 10); read through var_end.
Label(tk,text='结束页:').place(x=360,y=40)
var_end = Variable()
e = Entry(tk, textvariable=var_end,width=10).place(x=420,y=40)
var_end.set(10)

# Button that triggers getanswer() when clicked.
Button(tk, text="获取答案", command=getanswer).place(x=200,y=80)
# NOTE(review): the event loop call below is commented out, so the window is
# constructed but mainloop() is never entered by this file.
#tk.mainloop()

 

# posted @ 2019-08-22 19:42  凉城  阅读(6106)  评论(0)  编辑  收藏  举报