一直有这个想法做,可是奈何没有时间,周日晚上突然特别想做这个,然后一晚上边嗑啤酒边敲代码到凌晨五点多终于写完了这个原型,4个小时大概提交了四百多次过了一百多个题目,正确率就那样哈哈,代码没有处理查重问题,由于某些因素有时候会中断,之后有时间再做改进
稍微讲一下原理:登录->获取题目->根据题目百度爬url->筛选CSDN的url用无头浏览器爬取代码->提交代码->判断答案->答案错误则继续爬下一个结果,答案正确则进入下一题
学校模拟oj登录啥的比较简单,就是密码MD5加密,然后有个csrf要注意一下,看了好久都没有找到可以越权的地方,比较尴尬。
因为CSDN的内置搜索还没有百度好用,登录后爬到题目和内容再去爬百度,爬取百度参考了博客上面的,但是要注意的是百度给的url是重定向的,要处理一下得到真实url,然后再去爬CSDN,由于CSDN用了js加密,而且蛮复杂网上也没有找到案例,就用无头浏览器了,没什么技术含量,哈哈
代码贴上,要运行的话需要先安装谷歌 Chrome 无头浏览器的驱动(ChromeDriver)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Prototype auto-solver for the HNUCM online judge.

Workflow: log in to the judge -> fetch a problem statement -> search Baidu
with the statement text -> keep the hits that resolve to CSDN blog posts ->
scrape the C++ code block from each post with headless Chrome -> submit it ->
check the verdict; on failure try the next hit, on success go to the next
problem.

Running it requires Chrome plus a matching ChromeDriver on the PATH.
"""
import binascii
import hashlib
import json
import re
import time
import urllib
import urllib.parse
from urllib.parse import urljoin

import chardet
import requests
from lxml import etree
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
# Chrome options
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

OJ_BASE_URL = 'http://acm.hnucm.edu.cn/JudgeOnline/'
CSDN_PREFIX = 'https://blog.csdn.net'
REQUEST_TIMEOUT = 10      # seconds, applied to every HTTP request
RESULTS_PER_PAGE = 10     # Baidu numbers results 1..10, 11..20, ...
NO_CODE = 'nil'           # sentinel returned by ans_code() when nothing was scraped
NO_PROBLEM = '00'         # sentinel returned by get_pro() for an empty problem page

# CSRF token of the judge session; filled in by main() before logging in.
cs = None
user = {'id': '', 'passw': ''}

# Headless browser used to render CSDN pages (their content is built by JS).
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)
driver.set_page_load_timeout(10)
driver.set_script_timeout(10)

# Session shared by all judge requests so the login cookie is kept.
s = requests.session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0'}


def saveHtml(file_name, file_content):
    """Debug helper: write *file_content* to ``<file_name>.html``.

    Slashes in *file_name* are replaced by underscores so a URL can be used
    as the name.
    """
    with open(file_name.replace('/', '_') + ".html", "wb") as f:
        f.write(file_content.encode())


def format_url(url, params: dict = None) -> str:
    """Return *url* with *params* appended as a URL-encoded query string.

    When *params* is ``None`` or empty the URL is returned unchanged
    (previously the ``None`` default raised ``TypeError``).
    """
    if not params:
        return url
    query_str = urllib.parse.urlencode(params)
    return f'{url}?{query_str}'


def get_url(keyword):
    """Build the Baidu search URL for *keyword*."""
    return format_url("https://www.baidu.com/s", {'wd': str(keyword)})


def get_page(url):
    """Download *url* and return its text, or ``None`` on any failure.

    A non-200 status or a network error both yield ``None``.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    try:
        response = requests.get(url=url, headers=request_headers,
                                timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    # Force UTF-8, otherwise the text comes out garbled.
    response.encoding = "utf-8"
    print(response.status_code)
    if response.status_code == 200:
        return response.text
    return None


def _first_text(nodes):
    """Return the string value of the first node in *nodes*, or ''."""
    return nodes[0].xpath('string(.)') if nodes else ""


def parse_page(url, page):
    """Yield Baidu search results for the first *page* result pages.

    Each result is a dict with the keys ``title``, ``sub_url`` (Baidu's
    redirect link, ``''`` when missing) and ``abstract``.  Iteration stops
    early when a page cannot be downloaded or has no "next page" link.
    """
    for i in range(1, int(page) + 1):
        print("正在爬取第{}页....".format(i))
        html = get_page(url)
        if not html:
            # Download failed: there is nothing to parse on this page.
            return
        content = etree.HTML(html)
        if content is None:
            return

        for j in range(1, RESULTS_PER_PAGE + 1):
            # Result containers carry their global rank as the element id.
            rank = (i - 1) * RESULTS_PER_PAGE + j
            links = content.xpath('//*[@id="%d"]/h3/a/@href' % rank)
            abstract_nodes = (
                content.xpath('//*[@id="%d"]/div[@class="c-abstract"]' % rank)
                or content.xpath(
                    '//*[@id="%d"]/div/div[2]/div[@class="c-abstract"]' % rank)
            )
            # Fields are rebuilt for every result so that a missing value
            # is '' instead of a leftover from the previous result.
            yield {
                'title': _first_text(content.xpath('//*[@id="%d"]/h3/a' % rank)),
                'sub_url': links[0] if links else "",
                'abstract': _first_text(abstract_nodes),
            }

        # The "next page" anchor is the 10th pager link on the first page
        # and the 11th on later pages.
        next_index = 10 if i == 1 else 11
        rel_url = content.xpath(
            '//*[@id="page"]/a[{}]/@href'.format(next_index))
        if not rel_url:
            print("无更多页面!~")
            return
        url = urljoin(url, rel_url[0])


def get_real(o_url):
    """Resolve a redirecting link to the address it points at.

    Baidu result links are redirects; the request is sent with automatic
    redirects disabled and the ``Location`` header is returned.  If the URL
    is empty, does not redirect, or the request fails, *o_url* is returned
    unchanged.
    """
    if not o_url:
        return o_url
    try:
        r = requests.get(o_url, allow_redirects=False,
                         timeout=REQUEST_TIMEOUT)
    except RequestException:
        return o_url
    if r.status_code in (301, 302, 303, 307, 308):
        location = r.headers.get('location')
        if location:
            # Location may be relative; make it absolute.
            return urljoin(o_url, location)
    return o_url


# Example: Repairing a Road CSDN
# https://blog.csdn.net/Yellow_python/article/details/81107273
def ans_code(url):
    """Scrape the first C++ code block from the CSDN article at *url*.

    Returns the code text, or ``'nil'`` when no non-empty code block could
    be read.
    """
    if not url.startswith(CSDN_PREFIX):
        url = get_real(url)
    try:
        driver.get(url)
        time.sleep(3)  # give the page scripts time to render the article
    except WebDriverException as exc:
        # Best effort: a page-load timeout may still leave a usable page.
        print('page load problem for {}: {}'.format(url, exc))

    try:
        # Long articles hide part of the content behind this button.
        driver.find_element(By.ID, 'btn-readmore').click()
    except WebDriverException:
        pass  # no (clickable) "read more" button: article already expanded

    try:
        code = driver.find_element(By.CLASS_NAME, 'language-cpp').text
    except WebDriverException:
        return NO_CODE
    if not code.strip():
        return NO_CODE
    print(code)
    return code


def get_md5_value(str):
    """Return the MD5 hex digest (32 hex characters) of the given text.

    The parameter keeps its original name for compatibility even though it
    shadows the builtin.  The digest is no longer printed, to keep the
    password hash out of the console.
    """
    return hashlib.md5(str.encode(encoding='utf-8')).hexdigest()


def csrf():
    """Fetch the CSRF token the judge requires for login and submission.

    Raises:
        RuntimeError: if the token is not present in the response.
    """
    html = s.get(OJ_BASE_URL + 'csrf.php', timeout=REQUEST_TIMEOUT).text
    # Non-greedy so that only the attribute value is captured.
    match = re.search(r'name="csrf" value="(.+?)" ', html)
    if match is None:
        raise RuntimeError('CSRF token not found in csrf.php response')
    return match.group(1)


def login_oj(id, pa):
    """Log the shared session in as user *id* with plain-text password *pa*.

    The password is sent as its MD5 hex digest together with the CSRF token
    held in the module-level ``cs``.
    """
    paprm = {'user_id': id, 'password': get_md5_value(pa), 'csrf': cs}
    s.post(OJ_BASE_URL + 'login.php', data=paprm, timeout=REQUEST_TIMEOUT)


def submit_code(code, id):
    """Submit *code* as a solution to problem *id* through the session."""
    # NOTE(review): language '1' is presumably C++ on this judge - confirm.
    codes = {'id': id, 'language': '1', 'source': code,
             'reverse2': 'reverse', 'csrf': cs}
    s.post(OJ_BASE_URL + 'submit.php', data=codes, timeout=REQUEST_TIMEOUT)


def get_pro(id):
    """Return the search text (title plus statement spans) of problem *id*.

    Returns ``'00'`` when the page contains neither a title nor any styled
    ``<span>``.
    """
    page = s.get(OJ_BASE_URL + 'problem.php?id=' + str(id),
                 timeout=REQUEST_TIMEOUT).text
    titles = re.findall(r'</title><center><h2>(.+)</h2>', page,
                        flags=re.DOTALL)
    spans = re.findall(r'<span style="(.+?)<', page, flags=re.DOTALL)
    print(titles)
    pt = titles[0] if titles else NO_PROBLEM
    for span in spans:
        print(span)
        # Drop the first 18 captured characters.
        # NOTE(review): presumably the remainder of the style attribute
        # and the closing '">' - confirm against a real problem page.
        pt = pt + span[18:]
    print(pt)
    return pt


def is_ac(user_id=None):
    """Return True when the first verdict label on the status page is success.

    Args:
        user_id: account whose status page is checked; defaults to the
            logged-in account stored in ``user['id']`` (this used to be a
            hard-coded student id).
    """
    uid = user_id if user_id is not None else user['id']
    page = s.get(OJ_BASE_URL + 'status.php', params={'user_id': uid},
                 timeout=REQUEST_TIMEOUT).text
    labels = re.findall(r"class='label label-(.*?)' title='", page,
                        flags=re.DOTALL)
    if not labels:
        # No submission is listed.
        return False
    if labels[0] == 'success':
        print("ACACAC啦!!")
        return True
    print(labels[0])
    return False


def _solve_problem(problem_id):
    """Try CSDN search hits for one problem until a submission is accepted."""
    keyword = get_pro(problem_id)
    if keyword == NO_PROBLEM:
        return
    for result in parse_page(get_url(keyword), 2):
        print(result)
        real_url = get_real(result['sub_url'])
        if not real_url.startswith(CSDN_PREFIX):
            continue
        try:
            code = ans_code(real_url)
            if code == NO_CODE:
                continue
            submit_code(code, problem_id)
            time.sleep(8)  # wait for the judge before reading the verdict
            if is_ac():
                break
        except Exception as exc:
            # Best effort per candidate: report and move on to the next hit.
            # Unlike a bare except this still lets Ctrl+C stop the run.
            print('candidate {} failed: {}'.format(real_url, exc))


def main(start=1100, stop=1400):
    """Log in and try to solve problems ``start`` .. ``stop - 1``.

    Prompts for the account id and password on standard input.  The
    headless browser is always shut down on exit.
    """
    global cs
    user['id'] = input('账号:')
    user['passw'] = input('密码:')
    try:
        cs = csrf()
        login_oj(user['id'], user['passw'])
        for problem_id in range(start, stop):
            try:
                _solve_problem(str(problem_id))
            except RequestException as exc:
                # A network hiccup on one problem must not end the run.
                print('problem {} skipped: {}'.format(problem_id, exc))
    finally:
        driver.quit()


if __name__ == '__main__':
    main()