【fenbi】python爬虫用于做题复习

#!/usr/bin/env python
# coding: utf-8

# Cleans up question stems and the solution text of each question.
def jiexi(liebiao):
    """Return a new list of the input strings with escape residue removed.

    Two passes are applied to every string, in this order:

    1. delete each occurrence of the escaped attribute ``flag="tex"`` (as it
       appears in the raw text: a backslash before each quote, followed by
       one space);
    2. delete every remaining backslash.
    """
    without_flag = [re.sub(r'flag=\\"tex\\" ', '', item) for item in liebiao]
    return [re.sub(r'\\', '', item) for item in without_flag]
# Cleans up the answer options.
def xuanxiang(liebiao):
    """Return a new list where each option has its paragraph tags normalised.

    Every ``<p>`` and ``</p>`` is removed from the option, then a single
    ``</p>`` is appended to the end of what is left.
    """
    return [
        re.sub('</p>', '', re.sub('<p>', '', option)) + '</p>'
        for option in liebiao
    ]
import requests
import re
#import pdfkit
import os
import codecs
# The prompt is printed, but the raw_input call on the next line is commented
# out, so the practice URL has to be pasted into the `url` literal by hand.
print "请输入练习的网址:"
url = '填入粉笔网你做的15道题的网址'#raw_input(unicode('网址','utf-8').encode('gbk'))
print "url:"+url
# Get the id of this practice session: the path segment that follows
# ".../xingce/xingce/" in the URL. The [0] raises IndexError when the URL does
# not match the pattern (the unedited placeholder above does not match).
id_ = re.findall(r'https://www.fenbi.com/spa/tiku.*?/xingce/xingce/(.*?)/',url,re.S)[0]
# Exercise API URL built from that id.
mid_url = 'https://tiku.fenbi.com/api/xingce/exercises/'+str(id_)+'?app=web&kav=12&version=3.0.0.0'
print "mid_url:"+mid_url
# Request headers; the Cookie value is a placeholder to be replaced with the
# user's own session values.
headers = {
       'Cookie': '你自己的cookie; persistent=你的信息; sess=你的信息; userid=你的id'
##### the complete headers go here
}
response = requests.get(url=mid_url,headers=headers)
# Force UTF-8 decoding of the body before .text is read.
response.encoding = 'utf-8'
page_text_1 = response.text
# Get the question-group parameter: the raw text between the brackets of
# "questionIds":[...]. findall returns a list of captured strings, and that
# list (not a single string) is what is stored here.
id_list = re.findall('\"questionIds\"\:\[(.*?)\]\,',page_text_1,re.S)

# Get the codes of the answered questions (unanswered ones are excluded): the
# 1-2 character quoted keys that sit directly before :{"questionId in the text.
your_answerid = re.findall(r'"(.{1,2})":{"questionId',page_text_1,re.S)

# Get the user's own answers: the "choice" value inside each "answer" object.
your_answer = re.findall(r'"answer":{"choice":"(.*?)",',page_text_1,re.S)

# Name of this practice session: the first "name" value in the text. The [0]
# raises IndexError if no match is found.
name = re.findall(r'"name":"(.*?)",',page_text_1,re.S)[0]
# The request that actually carries the data: the questions endpoint, queried
# with the ids captured from the exercise response.
timu_url = 'https://tiku.fenbi.com/api/xingce/questions'
params = {
 'ids': id_list
}
response = requests.get(url=timu_url,headers=headers,params=params)
response.encoding = 'utf-8'
page_text_2 = response.text # [questions] text data
# Get the correct answers
true_answer = re.findall('"correctAnswer":{"choice":"(.*?)"',page_text_2,re.S)
# The request that actually carries the data: the solutions endpoint, queried
# with the same params. `response` is rebound here, so any later use of
# `response` sees the solutions response.
solution_url = 'https://tiku.fenbi.com/api/xingce/solutions'
response = requests.get(url=solution_url,headers=headers,params=params)
response.encoding = 'utf-8'
page_text_3 = response.text# [solutions] text data
# Get the solution texts
solution_list = re.findall('"solution":"(.*?)","source"',page_text_3,re.S) # regex: match from "solution" up to "source"
# Strip the escaped TeX flag and the backslashes from every solution text.
solution_last = jiexi(solution_list)
# cailiao: per-question shared material ('none' when a question has none).
# timu: per-question stem.
cailiao = []
timu = []
# Get the stems of single questions and of compound questions, one entry per
# item of the JSON body of the most recent response.
for each in response.json():
 timu.append(each['content'])
 try:
  cailiao.append(each['material']['content'])
 except (KeyError, TypeError):
  # KeyError: the item has no 'material' (or the material has no 'content').
  # TypeError: 'material' is present but null, so it cannot be subscripted.
  # Only these two are treated as "no material"; anything else (including
  # KeyboardInterrupt, which the former bare except swallowed) propagates.
  # The 'none' sentinel is what the HTML-building code compares against.
  cailiao.append('none')
# Get the option texts: each pattern matches an "options" array of exactly
# four quoted strings and captures one of them (1st=A, 2nd=B, 3rd=C, 4th=D).
# The captures are taken from the raw response text, so they are not
# JSON-decoded.
A_option = re.findall('\"options\"\:\[\"(.*?)\"\,\".*?\"\,\".*?\"\,\".*?\"\]',page_text_2,re.S)
B_option = re.findall('\"options\"\:\[\".*?\"\,\"(.*?)\"\,\".*?\"\,\".*?\"\]',page_text_2,re.S)
C_option = re.findall('\"options\"\:\[\".*?\"\,\".*?\"\,\"(.*?)\"\,\".*?\"\]',page_text_2,re.S)
D_option = re.findall('\"options\"\:\[\".*?\"\,\".*?\"\,\".*?\"\,\"(.*?)\"\]',page_text_2,re.S)
# Disabled post-processing: the triple-quoted text below is a bare string
# expression, so none of the calls inside it are executed.
"""
A_option = xuanxiang(A_option)
B_option = xuanxiang(B_option)
C_option = xuanxiang(C_option)
D_option = xuanxiang(D_option)
A_option = jiexi(A_option)
B_option = jiexi(B_option)
C_option = jiexi(C_option)
D_option = jiexi(D_option)
"""



# Build the HTML
# count walks the questions (positions in true_answer); count_1 walks the
# answered entries (positions in your_answerid / your_answer).
count = 0
count_1 = 0
# all_content: page that receives questions plus correct answers and solutions.
# all_content_0: page that receives only the questions and their options.
all_content = "<!DOCTYPE html>\n<meta charset='utf-8'>\n<html>"
all_content_0 = "<!DOCTYPE html>\n<meta charset='utf-8'>\n<html>"
# Debug output of the three parsed lists.
print 'true_answer ',true_answer
print 'your_answer ',your_answer
print 'your_answerid ',your_answerid
# First pass: for every question that was answered wrongly and that carries
# shared material (a compound question), append the material once, followed by
# the question number, its stem and its four options, to all_content.
#
# Two cursors are in play:
#   count   - index of the current question; this is the index into every
#             per-question list (true_answer, cailiao, timu, *_option).
#   count_1 - index of the next entry in your_answerid / your_answer, which
#             only contain the questions that were actually answered.
# The per-question lists are therefore indexed with `count` (as the second
# pass below does); indexing them with `count_1` picks the wrong question as
# soon as one question has been skipped.
# NOTE(review): your_answerid[count_1] raises IndexError when no question was
# answered at all; the second pass has the same precondition.
for each in true_answer:
 # your_answerid entries are compared against str(count), i.e. they are
 # treated as the (stringified) index of an answered question.
 if your_answerid[count_1]==str(count):
  if each != your_answer[count_1]:
   # Handle compound questions: skip questions without material, and skip
   # material that has already been written to the page.
   if cailiao[count] != 'none' and cailiao[count] not in all_content:
    all_content += cailiao[count]
    all_content += str(count+1)
    # The non-ASCII separators are unicode literals: all_content is already
    # unicode once material has been appended, and adding a non-ASCII byte
    # string to it raises UnicodeDecodeError under Python 2.
    all_content += u'、'
    # Drop the first three characters of the stem (presumably a leading
    # '<p>' - confirm against the API output).
    all_content += timu[count][3:]
    all_content += u'A、'
    all_content += A_option[count]
    all_content += u'B、'
    all_content += B_option[count]
    all_content += u'C、'
    all_content += C_option[count]
    all_content += u'D、'
    all_content += D_option[count]
    all_content += '<br>'
  # Advance the answer cursor, but never past the last answered entry.
  if count_1 < len(your_answerid)-1:
   count_1 += 1
 # Advance the question cursor, but never past the last question.
 if count < len(true_answer)-1:
  count += 1
# Reset both cursors for the second pass and separate the two sections.
count = 0
count_1 = 0
all_content += '<br>'
# Second pass over every question: determine the user's answer (or a sentinel
# when the question was not answered) and, when it differs from the correct
# answer, append the question to both pages and the correct answer plus the
# solution to all_content only.
for each in true_answer:
 # your_answerid entries are compared against str(count); a match means the
 # current question was answered and your_answer[count_1] is the user's choice.
 if your_answerid[count_1]==str(count):
  each_res = your_answer[count_1]
  # Advance the answer cursor, but never past the last answered entry.
  if count_1 < len(your_answerid)-1:
   count_1 += 1
 else:
  # Not answered: this sentinel presumably never equals a real choice, so the
  # question falls into the "wrong" branch below.
  each_res = "NULL_ANSWER"
 if each != each_res:
  # If an option contains an img, patch in the missing parts so the picture
  # displays normally. Working on a list of characters, the three edits are:
  #   1. pop the character just before '"//fb.' (index - 1);
  #   2. insert 'fontSize=18&' right after 'formulas?' (offset +8 instead of
  #      +9 because edit 1 already shifted the text left by one);
  #   3. pop the character just before '" /></p>' (edits 1 and 2 cancel out,
  #      so the index computed on the original string is still correct).
  # All positions are computed on the unmodified string before any edit, so
  # the pop / insert / pop order must not be changed. This assumes the three
  # markers occur in that order; str.index raises ValueError if one is absent.
  # NOTE(review): the popped characters are presumably the backslashes of \"
  # escapes left in the raw response text - confirm.
  if A_option[count].find('img') != -1:
    A_option_list = list(A_option[count])
    nPos_1=A_option[count].index('"//fb.')-1
    nPos_2=A_option[count].index('formulas?')+8 # insert at the end of this string
    nPos_3=A_option[count].index('\" /></p>')-1
    A_option_list.pop(nPos_1)
    A_option_list.insert(nPos_2,'fontSize=18&')
    A_option_list.pop(nPos_3)
    A_option[count] = ''.join(A_option_list)
  # Same patch for option B.
  if B_option[count].find('img') != -1:
    B_option_list = list(B_option[count])
    nPos_1=B_option[count].index('"//fb.')-1
    nPos_2=B_option[count].index('formulas?')+8 # insert at the end of this string
    nPos_3=B_option[count].index('\" /></p>')-1
    B_option_list.pop(nPos_1)
    B_option_list.insert(nPos_2,'fontSize=18&')
    B_option_list.pop(nPos_3)
    B_option[count] = ''.join(B_option_list)
  # Same patch for option C.
  if C_option[count].find('img') != -1:
    C_option_list = list(C_option[count])
    nPos_1=C_option[count].index('"//fb.')-1
    nPos_2=C_option[count].index('formulas?')+8 # insert at the end of this string
    nPos_3=C_option[count].index('\" /></p>')-1
    C_option_list.pop(nPos_1)
    C_option_list.insert(nPos_2,'fontSize=18&')
    C_option_list.pop(nPos_3)
    C_option[count] = ''.join(C_option_list)
  # Same patch for option D.
  if D_option[count].find('img') != -1:
    D_option_list = list(D_option[count])
    nPos_1=D_option[count].index('"//fb.')-1
    nPos_2=D_option[count].index('formulas?')+8 # insert at the end of this string
    nPos_3=D_option[count].index('\" /></p>')-1
    D_option_list.pop(nPos_1)
    D_option_list.insert(nPos_2,'fontSize=18&')
    D_option_list.pop(nPos_3)
    D_option[count] = ''.join(D_option_list)
  # Question header, stem and the four options (separated by '***') go to
  # both pages.
  all_content   =  all_content   + u'第'+str(count+1)+u'题' + timu[count] + A_option[count] + '***' + B_option[count] + '***' + C_option[count] + '***' + D_option[count] + '<br>'
  all_content_0 =  all_content_0 + u'第'+str(count+1)+u'题' + timu[count] + A_option[count] + '***' + B_option[count] + '***' + C_option[count] + '***' + D_option[count] + '<br>'
  # Only all_content gets the "correct answer of question N is" label, the
  # answer letter and the solution text.
  temp = u'第'+str(count+1)+u'题的正确答案为'
  all_content += temp
  # Map the choice code '0'..'3' to the letter A..D; any other value adds
  # no letter.
  if true_answer[count]=='0':
   all_content += 'A'
  elif true_answer[count]=='1':
   all_content += 'B'
  elif true_answer[count]=='2':
   all_content += 'C'
  elif true_answer[count]=='3':
   all_content += 'D'
  all_content +=  solution_last[count]
  all_content += '<br>'
 # Advance the question cursor, but never past the last question.
 if count < len(true_answer)-1:
  count += 1
# Close both documents.
all_content += '</html>'
all_content_0 += '</html>'
# Output paths: the hard-coded directory G:\lqj\ plus the practice name.
# The "(题目)" file receives the questions-only page, the "(解析)" file the
# page with answers and solutions.
path_name_0 = 'G:\\lqj\\' + name + u'(题目).html'
path_name_1 = 'G:\\lqj\\' + name + u'(解析).html'
# Save as HTML files (UTF-8 encoded)
with codecs.open(path_name_0,'w',encoding='utf-8') as fp:
 fp.write(all_content_0)
with codecs.open(path_name_1,'w',encoding='utf-8') as fp:
 fp.write(all_content)
# Delete the HTML file (disabled)
#os.remove(path_name)
print "完成任务"

代码来源

Python爬虫自动化获取华图和粉笔网站的错题(推荐)

参照这位博主的代码,进行部分修正,适合自己使用,望各位惠存

posted @ 2022-02-17 10:48  CQ_LQJ  阅读(337)  评论(0)  编辑  收藏  举报