【shikaobang】 Python 爬虫脚本
# -*- coding: utf-8 -*-
"""Crawler for the shikaobang.cn question bank (legacy script).

After shikaobang switched its URLs to encrypted IDs, the IDs can no longer be
decoded.  The only usable source of URLs is the set of links inside the
<div class="title">相关推荐</div> ("related recommendations") section.

Workaround: the related recommendations are listed in question order, so the
last link found on each page is used as the next starting URL, and the loop
keeps collecting the encrypted IDs until the configured end URL is reached.

The script runs three stages in order:

1. walk the recommendation links from ``url_name_start`` to ``url_name_end``;
2. download every collected page to ``html/sz_<number>``;
3. parse the saved pages and append one '#'-separated record per question to
   the output text file (numbers of pages that failed go to
   ``out/fail_num.txt``).
"""
import codecs
import json
import random
import re
import time
import urllib

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: Request/urlopen live in urllib.request
    import urllib.request as urllib2

import pandas as pd
from bs4 import BeautifulSoup

BASE_URL = "http://www.shikaobang.cn"
USER_AGENT = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
# Change the exported txt file name before every new run.
OUTPUT_PATH = u'out/时政202011-20210325.txt'
FAIL_PATH = 'out/fail_num.txt'


def _open_page(path):
    """Request ``BASE_URL + path`` with the browser User-Agent; return the response."""
    request = urllib2.Request(BASE_URL + path, headers={'User-Agent': USER_AGENT})
    return urllib2.urlopen(request)


def html_chuli(html):
    """Extract the question fields from the markup of one question page.

    Args:
        html: page markup, passed straight to ``BeautifulSoup``.

    Returns:
        The string ``'0'`` when the page has no ``div.main-content``.
        Otherwise a 9-tuple ``(t_miaosu, t_leixing, t_content, x_ABCD,
        x_content, z_xueze, t_news_title, t_news_typs, t_news_time)``:

        * ``t_miaosu``   - ``content`` of the element with name="description"
        * ``t_leixing``  - ``.string`` of the first ``span`` (question type)
        * ``t_content``  - ``.string`` of the first ``div.question-title``
        * ``x_ABCD``     - per option, a list holding its first text token
        * ``x_content``  - per option, the list of its remaining text tokens
        * ``z_xueze``    - ``.string`` of every element marked as correct
        * ``t_news_*``   - elements with class news-content-title / news-typs /
          news-time

    Raises:
        TypeError: if the page has no element with name="description".
        IndexError: if the main content has no ``span`` or question title.
    """
    # Parser named explicitly so the result does not depend on which optional
    # parsers are installed (the crawl stage already uses "html.parser").
    html_data = BeautifulSoup(html, "html.parser")
    t_miaosu = html_data.find(attrs={'name': 'description'})['content']  # question description
    t_news_title = html_data.find_all(attrs={'class': 'news-content-title'})
    t_news_typs = html_data.find_all(attrs={'class': 'news-typs'})
    t_news_time = html_data.find_all(attrs={'class': 'news-time'})
    tdata1 = html_data.find("div", attrs={'class': 'main-content'})  # first content frame
    if not tdata1:
        return '0'

    t_leixing = tdata1.select('span')[0].string  # question type
    # Selector reminder: '#' selects by id, '.' selects by class.
    t_content = tdata1.select('div.question-title')[0].string  # question text
    x_ABCD = []     # option labels (A/B/C/D)
    x_content = []  # text belonging to each option label
    for item in tdata1.select('div.question-item'):  # every answer option
        item_middle = item.get_text().split()
        x_ABCD.append(item_middle[:1])
        x_content.append(item_middle[1:])
    # Correct answers: multiple-choice markers first, then true/false markers.
    z_xueze = [item.string for item in tdata1.select('label.actives')]
    z_xueze.extend(item.string for item in tdata1.select('div.question-item.correct i'))
    return t_miaosu, t_leixing, t_content, x_ABCD, x_content, z_xueze, t_news_title, t_news_typs, t_news_time


def _format_record(number, fields):
    """Build the '#'-separated output line for one question.

    Returns None when the question type is missing, in which case the page is
    skipped without being reported as a failure.
    """
    (t_miaosu, t_leixing, t_content, x_ABCD, x_content, z_xueze,
     t_news_title, t_news_typs, t_news_time) = fields

    # An option may consist of a label only (or of nothing at all); fall back
    # to "" instead of indexing into an empty token list.
    # NOTE(review): only the first whitespace-separated token of the option
    # text is kept, exactly as before - confirm that this is intended.
    s_1 = "".join(
        (label[0] if label else "") + ":" + (content[0] if content else "") + " "
        for label, content in zip(x_ABCD, x_content)
    )
    s_2 = "".join(answer.strip() for answer in z_xueze)

    t_n = ""
    for z, title in enumerate(t_news_title):
        new1 = title.text if title else ""
        new2 = t_news_typs[z].text if t_news_typs[z] else ""
        new3 = t_news_time[z].text if t_news_time[z] else ""
        t_n = t_n + new1 + "|" + new2 + "|" + new3 + "&"

    if t_leixing is None:
        return None
    return "#".join([
        str(number),
        t_miaosu.replace("\n", ""),
        t_leixing,
        t_content.replace(" ", "").replace("\n", ""),
        s_1.replace("\n", ""),
        s_2.replace("\n", ""),
        t_n.replace("\n", ""),
    ])


def _record_failure(number):
    """Append ``number`` to the failure list and report it on stdout."""
    with codecs.open(FAIL_PATH, 'a', encoding="utf-8") as f2:
        f2.write(str(number) + "\n")
    print(str(number) + u"号html文件导入失败!")


# ---------------------------------------------------------------------------
# Stage 1: collect the question URLs by following the recommendation links.
# ---------------------------------------------------------------------------
a1 = 101500  # number of the first saved page; adjust by hand before each run
url_name_start = u'/questionbank/5YmJvWgYm6'  # first urlname of the range
url_name_end = u'/questionbank/G5mbgoM1aX'    # last urlname of the range
urlname_list = [url_name_start]
a = 1
question_link = re.compile("/questionbank/")
while True:
    html_data = BeautifulSoup(_open_page(url_name_start), "html.parser")
    if html_data.find(name='a') is None:
        # The page came back without any link: drop the newest URL and retry
        # from the one collected before it.
        # NOTE(review): when only the start URL is in the list, pop() empties
        # it and the [-1] lookup raises IndexError.
        urlname_list.pop()
        url_name_start = urlname_list[-1]
        # Unicode literal: concatenating a byte string with the unicode URL
        # raises UnicodeDecodeError on Python 2.
        print(u"网页抓取失败,此时网址为:" + url_name_start)
        continue
    for m in html_data.find_all(href=question_link):
        urlname_list.append(m['href'])
        if m['href'] == url_name_end:
            break
        a = a + 1
    url_name_start = urlname_list[-1]
    if url_name_end == url_name_start:
        break
    print(u"网页抓取成功,此时网址为:" + url_name_start)
    print(u"查询结果共" + str(a) + u"条")
print(u"最终查询结果共" + str(a) + u"条")
print(u'开始爬取网页')

# ---------------------------------------------------------------------------
# Stage 2: download every collected page to html/sz_<number>.
# ---------------------------------------------------------------------------
a2 = a1  # number of the first page saved by this run
for i in urlname_list:
    try:
        html = _open_page(i)
        # Binary mode stores the response bytes unchanged; the with-block
        # closes the file even when the read or write fails.
        with open('html/sz_' + str(a1), 'wb') as f:
            f.write(html.read())
        a1 = a1 + 1
    except Exception:
        # Best effort: report the URL that could not be saved and move on.
        print(i)
print("下次使用该编码作为起始值:" + str((a1 // 100 + 1) * 100))
print("爬取网页结束,开始处理文本")

# ---------------------------------------------------------------------------
# Stage 3: parse the saved pages and export one text record per question.
# ---------------------------------------------------------------------------
for i in range(a2, a1):
    try:
        with open('html/sz_' + str(i), 'rb') as f:
            # "ignore" drops byte sequences that are not valid UTF-8.
            contents = f.read().decode("utf-8", "ignore")
        fields = html_chuli(contents)
        if not isinstance(fields, tuple):
            # html_chuli signals "no main content" with the string '0'.
            raise ValueError("page %d has no main-content block" % i)
        k1 = _format_record(i, fields)
        if k1 is None:
            continue
        with codecs.open(OUTPUT_PATH, 'a', encoding="utf-8") as f1:
            f1.write(k1 + "\n")
    except Exception:
        _record_failure(i)
print(u"处理完毕!再次执行请修改“输出文件名”,并保存py文件,然后重新开始!!!")
此代码仅作纪念之用,目前已无法使用。
study just for life!