初学python爬虫,记录一下第一个程序

-------------- 更新一波 version : 2.0.0 ----------------------------------

现在不残疾了。。。

""" 
    2019/10/17
    version: 2.0.0
    by Zeronera
    实现对NITOJ题面,输入输出的简单爬取
"""

import requests
from bs4 import BeautifulSoup
import os
from fake_useragent import UserAgent


def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request is sent with a randomly chosen User-Agent header and a
    30 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, decoded with the encoding detected from the response
        content, or the sentinel string "error" if building the headers,
        sending the request, or the HTTP status check failed.
    """
    try:
        headers = {'User-Agent': UserAgent().random}  # random user-agent
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()  # raises HTTPError on a 4xx/5xx status
        # Trust the encoding guessed from the body over the header value.
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Deliberately broad (best-effort fetch), but not a bare ``except``:
        # KeyboardInterrupt and SystemExit must still propagate so the user
        # can abort a slow request with Ctrl-C.
        return "error"


def get_problem_text(id):
    """Print the description, input and output sections of one problem.

    The problem page is fetched through ``getHTMLText`` and every
    ``<div class="content-wrapper well">`` element is collected; the first
    three are printed as description, input and output.

    Args:
        id: Problem number as a string; it is appended to the page URL.

    Returns:
        "Invalid ID!!!" (also printed) when the page could not be fetched or
        holds fewer than three content blocks; otherwise None.
    """
    failure = "Invalid ID!!!"

    page = getHTMLText('https://www.nitacm.com/problem_show.php?pid=' + id)
    if page == "error":
        print(failure)
        return failure

    parsed = BeautifulSoup(page, 'lxml')
    sections = parsed.find_all('div', {'class': 'content-wrapper well'})
    if not len(sections) >= 3:
        print(failure)
        return failure

    # Only the first three blocks are used; any further ones are ignored.
    description, input_spec, output_spec = sections[:3]
    print("\n题面描述:\n\t" + description.text.strip() + '\n')
    print("Input:\n\t" + input_spec.text.strip() + '\n')
    print("Output:\n\t" + output_spec.text.strip() + '\n')


def main():
    """Run the interactive prompt loop.

    Repeatedly asks for a problem number, prints that problem via
    ``get_problem_text``, then asks whether to continue. Typing ``exit``
    ends the loop; any other input clears the terminal and starts over.
    """
    while True:
        problem_id = input('题目编号[100-22815]:')
        get_problem_text(problem_id)
        print('\n')
        t = input("输入exit退出程序,输入其他字符继续运行程序:")
        if t == 'exit':
            break
        # "cls" exists only on Windows; other platforms use "clear".
        os.system("cls" if os.name == "nt" else "clear")


# Start the interactive loop only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

 

 

这个程序实现了对 NITOJ 题面信息(题面描述、输入、输出)的简单爬取。

因为有些题目的题面、Input、Output 描述用的是 <p> 标签,有些用的是 <div> 标签,还有些不用任何标签、直接写在外层的 <div> 下面,而 1.0.0 版本的程序只识别 <p> 标签内的文本,所以当这三个部分里都找不到 <p> 标签时,程序就会输出那句奇怪的话 =。=(对不起,请换一个题)。

因为初学,所以程序比较残疾

因为是第一次实现,所以想记录一下

路过大佬请无视

""" 
    2019/8/9
    version: 1.0.0
    by Zeronera
    实现对NITOJ题面,输入输出的简单爬取
"""

import requests
from bs4 import BeautifulSoup


def get_problem_text(id):
    """Print the <p> paragraphs of a problem's three content sections.

    The problem page is downloaded and its
    ``<div class="content-wrapper well">`` elements are collected. The first
    three are printed under the headings description, Input and Output, one
    line per ``<p>`` tag found inside them.

    If the page has fewer than three such blocks, or none of the three
    contains a ``<p>`` tag, an apology message is printed instead.

    Args:
        id: Problem number as a string; it is appended to the page URL.
    """
    url = 'https://www.nitacm.com/problem_show.php?pid=' + id
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'content-wrapper well'})

    # Without this guard, indexing the sections raised IndexError on pages
    # that lack the three expected content blocks.
    if len(div_list) < 3:
        print("对不起,请换一个题")
        return

    headings = ("题面描述:", "Input:", "Output:")
    found_any = False
    # zip stops after the three headings, so extra blocks are ignored.
    for heading, section in zip(headings, div_list):
        print(heading)
        for paragraph in section.find_all('p'):
            print(paragraph.text)
            found_any = True

    # Only text inside <p> tags is recognised; tell the user when none of
    # the sections had any.
    if not found_any:
        print("对不起,请换一个题")


def main():
    """Run the interactive prompt loop.

    Asks for a problem number, prints that problem via
    ``get_problem_text``, and repeats until the user answers ``exit`` to
    the follow-up prompt.
    """
    keep_going = True
    while keep_going:
        get_problem_text(input('题目编号[100-22815]:'))
        print('\n')
        answer = input("输入exit退出程序,输入其他字符继续运行程序:")
        keep_going = answer != 'exit'


# Start the interactive loop only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

 

posted @ 2019-08-09 11:14  Zeronera  阅读(239)  评论(0)  编辑  收藏  举报