第一次爬虫测试

球赛测试：

# -*- coding: utf-8 -*-
"""
Created on Thu May 23 00:41:48 2019

@author: h2446
"""

from random import random
def printIntro():
    print("这个程序模拟两支排球队A和B的排球比赛")
    print("程序运行需要A和B的能力值(以0到1之间的小数表示)")
def getInputs():
    a=eval(input("请输入队伍A的能力值(0~1):"))
    b=eval(input("请输入队伍B的能力值(0~1):"))
    n=eval(input("模拟比赛的场次:"))
    return a,b,n
def simNGames(n,probA,probB):
    winsA,winsB=0,0
    for i in range(n):
        scoreA,scoreB=simOneGame(probA,probB)
        if scoreA>scoreB:
            winsA +=1
        else:
            winsB +=1
        return winsA,winsB
def gameOver(a,b):
        if (a>25 and abs(a-b)>=2 )or(b>25 and abs(a-b)>=2):
            return True
        else:
            return False
def simOneGame(probA,probB):
    scoreA,scoreB=0,0
    serving = "A"
    while not gameOver(scoreA,scoreB):
        if serving =="A":
            if random()<probA:
                scoreA +=1
            else:
                serving="B"
        else:
            if random()<probB:
                scoreB +=1
            else:
                serving="A"
    return scoreA,scoreB
def final(probA,probB):
     winsA,winsB=simNGames1(4,probA,probB)
     printSummary(winsA,winsB)
     if not winsA==3 or winsB==3:
         if winsA==winsB==2:
             winsA1,winsB1=simOneGame1(probA,probB)
             finalprintSummary(winsA,winsB)
     else:
         finalprintSummary(winsA,winsB)
def simNGames1(n,probA,probB):
     winsA,winsB=0,0
     for i in range(n):
         scoreA,scoreB=simOneGame2(probA,probB)
         if winsA==3 or winsB==3:
             break
         if scoreA>scoreB:
             winsA+=1
         else:
             winsB+=1
     return winsA,winsB
def simOneGame2(probA,probB):
     scoreA,scoreB=0,0
     serving="A"
     while not GG(scoreA,scoreB):
         if serving=="A":
             if random() < probA:
                 scoreA += 1
             else:
                 serving="B"
         else:
             if random() < probB:
                 scoreB += 1
             else:
                 serving="A"
     return scoreA,scoreB
def simOneGame1(probA,probB):
    scoreA,scoreB=0,0
    serving="A"
    while not finalGameOver(scoreA,scoreB):
        if serving=="A":
            if random() < probA:
                scoreA += 1
            else:
                  serving="B"
        else:
            if random() < probB:
                scoreB += 1
            else:
                serving="A"
                return scoreA,scoreB
def GG(a,b):
    return a==3 or b==3
def finalGameOver(a,b):
     if (a==8 or b==8):
         if a>b:
             print("A队获得8分，双方交换场地")
         else:
             print("B队获得8分，双方交换场地")
     if (a>15 and abs(a-b)>=2 )or(b>15 and abs(a-b)>=2):
         return True
     else:
         return False
def finalprintSummary(winsA,winsB):
     n=winsA+winsB
     if n>=4:
         print("进行最终决赛")
         if winsA>winsB:
             print("最终决赛由A获胜")
         else:
             print("最终决赛由B获胜")
     else:
            if winsA>winsB:
                print("最终决赛由A获胜")
            else:
                print("最终决赛由B获胜")
def printSummary(winsA,winsB):
        n=winsA+winsB
        print("竞技分析开始，共模拟{}场比赛".format(n))
        print("选手A获胜{}场比赛，占比{:0.1%}".format(winsA,winsA/n))
        print("选手B获胜{}场比赛，占比{:0.1%}".format(winsB,winsB/n))
def main():
        printIntro()
        probA,probB,n=getInputs()
        winsA,winsB=simNGames(n,probA,probB)
        printSummary(winsA,winsB)
        final(probA,probB)
try:
    main()
except:
    print("Error")

测试结果：

使用requests库中的get（）函数访问如下一个网站20次（这里采用必应网为例子），打印返回状态，text内容，计算text（）属性和content属性所返回网页内容的长度。

import requests
from bs4 import BeautifulSoup
def getHTMLText(r):
    x = r.status_code
    if x == 200 :
        print("访问成功")
    if x == 404:
        print("访问失败")
    return ''
r = requests.get("https://cn.bing.com/",timeout = 100)
for i in range(1,21):          
    print("第{}次".format(i))
    getHTMLText(r)
url="https://cn.bing.com/"
r=requests.get(url)
print(r.status_code)
r.encoding='UTF-8'
print(r.text)

print("text属性长度{}".format(len(r.text)))
print("content属性长度{}".format(len(r.content)))

返回状态：

text部分内容：

text（）属性和content属性所返回网页内容的长度：

html页面的简单相关计算

先将下面的代码保存到电脑上，并使用浏览器打开

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
    <h1>我的第一个标题</h1>
    <p id=first>我的第一个段落。</p >
</body>
        <table border="1">
    <tr>
        <td>班级</td>
        <td>2班</td>
    </tr>
    <tr>
        <td>我的学号</td>
        <td>3109</td>
    </tr>
</table>
</html>

import requests
from bs4 import BeautifulSoup
f = open("C:/Users/86183/Desktop/.html",'r',encoding="utf-8")
r = f.read()
f.close()
soup = BeautifulSoup(r,"html.parser")
print("(1)head标签:\n",soup.head)
print("(2)body标签内容:\n",soup.title)
print("(3)id为first的标签对象:\n",soup.find(id='first'))
print("(4)获取该html中的中文字符:",soup.find('h1').string,\
   soup.find(id='first').string)

运行截图如下：

爬取中国大学网站的相关内容，并保存为CSV格式，代码如下

import requests
from bs4 import BeautifulSoup
def GetHTMLText(url):
    try :
        r = requests.get(url,timeout = 30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return  r.text
    except:
        return ""
def FindUnivList(soup):
    data = soup.find_all('tr')
    
    lth = data[0].find_all('th')          #通过查看网页html代码，找到第一个'tr'标签下的'th'标签就是表格标题。
    thead = []
    for th in lth[:4]:                      #前四个表格标题（第五个特殊，下面处理）
        thead.append(th.string)
    tOptions = data[0].find_all('option')     #处理第五个特殊的表格标题
    for tOption in tOptions:                 
        thead.append(tOption.string)
    allUniv.append(thead)
    
    for tr in data:                         #获取数据内容
        ltd = tr.find_all('td')
        if len(ltd) ==  0:
            continue
        tbodies = []
        for td in ltd:
            if td.string is None:           #对无数据内容处理
                tbodies.append('')
                continue
            tbodies.append(td.string)
        allUniv.append(tbodies)


def main():
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html"
    html = GetHTMLText(url)
    soup = BeautifulSoup(html,"html.parser")
    FindUnivList(soup)
    #储存为csv
    f = open('2019中国大学排名.csv','w',encoding='utf-8')
    for row in allUniv:
        f.write(','.join(row)+'\n')
    f.close()

allUniv = []
main()

View Code

截图如下：

posted @ 2020-05-17 23:34 Xiao_kong 阅读(198) 评论(0) 收藏举报

刷新页面返回顶部

第一次爬虫测试

公告