I'd been meaning to build this for a while but never had the time. Sunday night the urge suddenly hit, so I sat drinking beer and writing code until past 5 a.m. and finished this prototype. In about 4 hours it made 400-odd submissions and got 100-odd problems accepted; the accuracy is what it is, haha. The code doesn't deduplicate submissions, and it occasionally gets interrupted for various reasons. I'll improve it when I find the time.

A quick note on how it works: log in -> fetch a problem -> search Baidu with the problem text -> filter for CSDN URLs and scrape the solution code with a headless browser -> submit the code -> check the verdict -> wrong answer, keep scraping; accepted, move on to the next problem.
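Condensed, the main loop of the script below does exactly this (all helper names come from the full listing further down):

# Sketch of the control flow; the helpers are defined in the full script below
for pid in range(1100, 1400):                     # walk the problem ids
    title = get_pro(str(pid))                     # scrape title + statement
    for hit in parse_page(get_url(title), 2):     # two pages of Baidu results
        if get_real(hit['sub_url']).startswith('https://blog.csdn.net'):
            code = ans_code(hit['sub_url'])       # headless-browser scrape
            if code != 'nil':
                submit_code(code, str(pid))
                time.sleep(8)                     # wait for the judge to run
                if is_ac():
                    break                         # AC, next problem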

Logging in to the school's practice OJ is simple enough: the password is just MD5-hashed, plus there's a csrf token to watch out for. I stared at it for a long time without finding anywhere to escalate privileges, which was a bit awkward.
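Boiled down, the login flow is just this (same endpoints and form fields as in the full code below; the id and password here are placeholders):

import hashlib, re, requests

s = requests.Session()
# The login page embeds the csrf token in a hidden input
page = s.get('http://acm.hnucm.edu.cn/JudgeOnline/csrf.php').text
token = re.search(r'name="csrf" value="(.+?)"', page).group(1)
# The password is sent as its md5 hex digest, not plaintext
s.post('http://acm.hnucm.edu.cn/JudgeOnline/login.php', data={
    'user_id': 'your_id',          # placeholder
    'password': hashlib.md5('your_password'.encode('utf-8')).hexdigest(),
    'csrf': token,
})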

Since CSDN's built-in search still isn't as good as Baidu's, after logging in the bot scrapes the problem title and statement and feeds them to Baidu. The Baidu scraping follows a blog post I found, but note that the URLs Baidu returns are redirects, so you have to resolve them to get the real URL before hitting CSDN. CSDN itself uses JS-side encryption that's fairly involved, and I couldn't find any write-ups on it, so I fell back to a headless browser. Nothing clever, haha.
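The redirect part boils down to fetching with allow_redirects=False and reading the Location header; a minimal sketch (the same approach as get_real() in the full script):

import requests

def resolve_baidu_link(url):
    # Baidu wraps results in baidu.com/link?url=... which 302s to the real page
    r = requests.get(url, allow_redirects=False)
    if r.status_code == 302 and 'location' in r.headers:
        return r.headers['location']
    return url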

The code is below. To run it you need the driver for headless Chrome (chromedriver) installed.
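A quick way to confirm chromedriver is wired up before launching the whole thing (a minimal sanity check, assuming selenium 3.x like the script itself):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')
d = webdriver.Chrome(chrome_options=opts)  # fails here if the driver is missing
d.get('https://www.baidu.com')
print(d.title)
d.quit()

The full script: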

#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import urllib.parse
import hashlib
import time
import re
from lxml import etree
from urllib.parse import urljoin
from selenium import webdriver
# Chrome options, for running the browser headless
from selenium.webdriver.chrome.options import Options
from requests.exceptions import RequestException


cs = ''  # OJ csrf token, filled in by main()
user = {'id': '', 'passw': ''}
# Headless Chrome, used to scrape the JS-rendered CSDN pages
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)  # selenium 3.x API
driver.set_page_load_timeout(10)
driver.set_script_timeout(10)
# One session, so the OJ login cookie persists across requests
s = requests.session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0'}

def saveHtml(file_name, file_content):
    # Debug helper: dump a fetched page to disk for inspection
    with open(file_name.replace('/', '_') + ".html", "wb") as f:
        f.write(str.encode(file_content))

def format_url(url, params: dict = None) -> str:
    # Append an urlencoded query string to the base url
    query_str = urllib.parse.urlencode(params or {})
    return f'{url}?{query_str}'

def get_url(keyword):
    # Build the Baidu search url for the problem text
    params = {'wd': str(keyword)}
    return format_url("https://www.baidu.com/s", params)

def get_page(url):
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
        response = requests.get(url=url, headers=headers)
        # Force utf-8, otherwise the page text comes back garbled
        response.encoding = "utf-8"
        print(response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_page(url, page):
    # Walk up to `page` pages of Baidu results, yielding one dict per hit
    for i in range(1, int(page) + 1):
        print("Crawling result page {}...".format(i))
        # On page 1 the "next" link is the 10th anchor in the pager; later
        # pages have a "prev" link first, so it is the 11th
        flag = 10 if i == 1 else 11
        html = get_page(url)
        if html is None:
            return
        content = etree.HTML(html)
        for j in range(1, flag):
            data = {'title': '', 'sub_url': '', 'abstract': ''}
            n = (i - 1) * 10 + j  # Baidu numbers each result div by rank
            res_title = content.xpath('//*[@id="%d"]/h3/a' % n)
            if res_title:
                data['title'] = res_title[0].xpath('string(.)')
            sub_url = content.xpath('//*[@id="%d"]/h3/a/@href' % n)
            if sub_url:
                data['sub_url'] = sub_url[0]
            res_abstract = content.xpath('//*[@id="%d"]/div[@class="c-abstract"]' % n)
            if not res_abstract:
                res_abstract = content.xpath('//*[@id="%d"]/div/div[2]/div[@class="c-abstract"]' % n)
            if res_abstract:
                data['abstract'] = res_abstract[0].xpath('string(.)')
            yield data
        # Follow the "next page" link; stop when there is none
        rel_url = content.xpath('//*[@id="page"]/a[{}]/@href'.format(flag))
        if rel_url:
            url = urljoin(url, rel_url[0])
        else:
            print("No more pages!")
            return

def get_real(o_url):
    # Baidu result links are 302 redirects; resolve to the real target url
    r = requests.get(o_url, allow_redirects=False)  # don't follow the redirect
    if r.status_code == 302 and 'location' in r.headers:
        return r.headers['location']
    return o_url


# Repairing a Road, CSDN:
# https://blog.csdn.net/Yellow_python/article/details/81107273
def ans_code(url):
    # Scrape the C++ solution off a CSDN blog post with the headless browser
    url = get_real(url)
    code = 'nil'
    try:
        driver.get(url)
        time.sleep(3)  # let CSDN's JS finish rendering
    except:
        return code
    try:
        # Expand the folded article, then grab the highlighted C++ block
        driver.find_element_by_id('btn-readmore').click()
        code = driver.find_element_by_class_name('language-cpp').text
        print(code)
    except:
        pass
    return code

def get_md5_value(text):
    # Return the 32-char md5 hex digest the OJ expects for the password
    my_md5 = hashlib.md5()
    my_md5.update(text.encode(encoding='utf-8'))
    my_md5_Digest = my_md5.hexdigest()
    print(my_md5_Digest)
    return my_md5_Digest

def csrf():
    # Pull the csrf token out of the hidden input on the login page
    firsturl = 'http://acm.hnucm.edu.cn/JudgeOnline/csrf.php'
    html = s.get(firsturl).text
    t = re.compile(r'name="csrf" value="(.+)" ')
    resu = t.findall(html)
    return resu[0]

def login_oj(id, pa):
    global cs
    # Post md5(password) plus the csrf token; the session keeps the login cookie
    paprm = {'user_id': id, 'password': get_md5_value(pa), 'csrf': cs}
    s.post('http://acm.hnucm.edu.cn/JudgeOnline/login.php', data=paprm)

def submit_code(code, id):
    global cs
    # '1' selects C++ here, matching the language-cpp blocks we scrape
    url = 'http://acm.hnucm.edu.cn/JudgeOnline/submit.php'
    codes = {'id': id, 'language': '1', 'source': code, 'reverse2': 'reverse', 'csrf': cs}
    s.post(url, data=codes)
    
def get_pro(id):
    # Scrape the problem title and statement to use as the Baidu keyword
    url = 'http://acm.hnucm.edu.cn/JudgeOnline/problem.php?id=' + id
    tt = s.get(url).text
    ss = re.compile(r'</title><center><h2>(.+)</h2>', flags=re.DOTALL)
    si = re.compile(r'<span style="(.+?)<', flags=re.DOTALL)
    tit = ss.findall(tt)
    sis = si.findall(tt)
    print(tit)
    pt = '00'  # '00' marks a problem page that failed to parse
    try:
        pt = tit[0]
    except:
        pass
    for i in sis:
        try:
            print(i)
            pt = pt + i[18:]  # skip past the style attribute text
        except:
            pass
    print(pt)
    return pt

def is_ac():
    # Read the newest verdict label on the status page; label-success means AC
    url = 'http://acm.hnucm.edu.cn/JudgeOnline/status.php?user_id=' + user['id']
    tt = s.get(url).text
    ss = re.compile(r'class=\'label label-(.*?)\'  title=\'', flags=re.DOTALL)
    ff = ss.findall(tt)[0]
    if ff == 'success':
        print("ACACAC!!!")
        return True
    else:
        print(ff)
        return False
    
def main():
    global cs
    global user
    user['id'] = input('Username: ')
    user['passw'] = input('Password: ')
    cs = csrf()
    login_oj(user['id'], user['passw'])
    for id in range(1100, 1400):
        tit = get_pro(str(id))
        if tit == '00':
            continue  # problem page did not parse, skip it
        url = get_url(tit)
        results = parse_page(url, 2)
        for result in results:
            print(result)
            if not result['sub_url']:
                continue
            # Only bother with hits that resolve to CSDN blog posts
            if get_real(result['sub_url'])[0:21] == 'https://blog.csdn.net':
                try:
                    code = ans_code(result['sub_url'])
                    if code != 'nil':
                        submit_code(code, str(id))
                        time.sleep(8)  # give the judge time to run the tests
                        if is_ac():
                            break  # accepted, on to the next problem
                except:
                    pass


if __name__ == '__main__':
    main()