python获取页面文字信息
# -*- coding: utf-8 -*-
"""Scrape the answers of a Zhihu question page by page with headless Chrome.

Each page's long answers are written to ``<questionId>/page_<n>.txt``.
A small tkinter form is also built for entering the question id and the
page range.
"""
import html.parser
import os
import random
import re
import threading
import time
import traceback
import urllib.request
import tkinter.messagebox
from tkinter import *
from tkinter import ttk

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Answers whose text is not longer than this many characters are skipped.
MIN_ANSWER_LENGTH = 300
# Written after every saved answer (runtime text, kept as in the original).
ANSWER_SEPARATOR = "\n-----------我是分隔符------\n"


def getHtml(questionId, page):
    """Fetch one page of answers for a question and save the long ones.

    Opens the "updated answers" listing of the question in headless Chrome,
    scrolls down step by step so more content gets a chance to load, then
    writes the text of every ``RichContent-inner`` element longer than
    ``MIN_ANSWER_LENGTH`` characters to ``<questionId>/page_<page>.txt``.

    Args:
        questionId: Question id as a string (also used as the output folder).
        page: Page number to request.

    Returns:
        A ``BeautifulSoup`` object parsed from the final page source.
    """
    chrome_options = webdriver.ChromeOptions()
    # Maximized window; the original author notes that element lookup
    # failed without this flag.
    chrome_options.add_argument('--start-maximized')
    # Hide the "Chrome is being controlled by automated software" banner.
    chrome_options.add_argument('--disable-infobars')
    # Incognito mode.
    chrome_options.add_argument('--incognito')
    # Run without a visible browser window.
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(executable_path="chromedriver", options=chrome_options)

    def execute_times(times):
        """Scroll down in 1000px steps, then jump to the page bottom each time."""
        for i in range(times):
            print('第' + str(i) + '次点击')
            driver.execute_script(f"window.scrollTo(0, {1000 * i});")
            time.sleep(3)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # try/finally guarantees the browser process is shut down even when the
    # page load, scrolling or file write fails; main() swallows exceptions
    # and keeps looping, so a leak here would pile up Chrome processes.
    try:
        driver.get(
            "https://www.zhihu.com/question/" + questionId
            + "/answers/updated?page=" + str(page)
        )
        execute_times(12)

        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # find_elements(By...) works on both Selenium 3 and 4; the
        # find_elements_by_* helpers were removed in Selenium 4.3.
        answers = driver.find_elements(By.CLASS_NAME, "RichContent-inner")
        # Read .text once per element: every access is a WebDriver round-trip.
        texts = [answer.text for answer in answers]
        txt = "start\n" + "".join(
            text + ANSWER_SEPARATOR
            for text in texts
            if len(text) > MIN_ANSWER_LENGTH
        )

        # Make sure the output folder exists so the function also works
        # when called directly rather than through main().
        os.makedirs(questionId, exist_ok=True)
        out_path = questionId + "/page_" + str(page) + ".txt"
        with open(out_path, 'w', encoding="utf-8") as zhpage:
            zhpage.write(txt)
        print("爬取回答页面成功!!!")
        return result_soup
    finally:
        driver.quit()


def readTxt(path):
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


def main(questionId, startPage, endPage):
    """Scrape pages ``startPage`` .. ``endPage - 1`` of a question.

    A failure on one page is printed and does not stop the remaining pages.

    Args:
        questionId: Question id as a string.
        startPage: First page to fetch (inclusive).
        endPage: Page to stop before (exclusive, as with ``range``).
    """
    mkdir([questionId])
    for page in range(startPage, endPage):
        try:
            getHtml(questionId, page)
            # Random 5-7 second pause between pages.
            time.sleep(random.choice(range(5, 8)))
        except Exception:
            # Best effort: report the failure and move on to the next page.
            traceback.print_exc()


def mkdir(paths):
    """Create every directory in ``paths`` that does not exist yet."""
    for path in paths:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(path, exist_ok=True)


def getanswer():
    """Button callback: read the form fields and start scraping.

    Reads the module-level ``var_id``, ``var_start`` and ``var_end``
    variables. Values typed into an Entry come back as strings, so the page
    numbers are converted to ``int`` before being handed to ``main``;
    invalid input is reported in a message box instead of raising inside
    the Tk callback.
    """
    questionId = str(var_id.get()).strip()
    if not questionId:
        tkinter.messagebox.showerror('输入错误', '请输入问题标识')
        return
    try:
        start = int(var_start.get())
        end = int(var_end.get())
    except (TypeError, ValueError):
        tkinter.messagebox.showerror('输入错误', '开始页和结束页必须是整数')
        return
    main(questionId, start, end)


if __name__ == '__main__':
    # NOTE(review): hard-coded crawl that runs before the form is built;
    # kept as in the original - confirm whether it is still wanted.
    main(str(308829198), 101, 200)

    tk = Tk()
    tk.title('获取知乎问题所有答案')
    tk.geometry('600x150')
    frame = Frame(tk)
    Label(
        tk,
        text='问题标识:(例:https://www.zhihu.com/question/324405640/answer/720532471中的324405640 )',
        width=200,
        anchor=W,
        justify=LEFT,
    ).place(x=10, y=10)

    var_id = Variable()
    question_id = Entry(tk, textvariable=var_id, width=30)
    question_id.place(x=10, y=40)

    Label(tk, text='开始页:').place(x=230, y=40)
    var_start = Variable()
    Entry(tk, textvariable=var_start, width=10).place(x=290, y=40)
    var_start.set(1)

    Label(tk, text='结束页:').place(x=360, y=40)
    var_end = Variable()
    Entry(tk, textvariable=var_end, width=10).place(x=420, y=40)
    var_end.set(10)

    Button(tk, text="获取答案", command=getanswer).place(x=200, y=80)
    # NOTE(review): the event loop is disabled in the original, so the form
    # is built but never becomes interactive - confirm before enabling.
    # tk.mainloop()