第一个爬虫和测试
import requests from bs4 import BeautifulSoup allUniv = [] def getHTMLText(url): try: r = requests.get(url, timeout=30) r.raise_for_status() r.encoding = 'utf-8' return r.text except: return "" def fillUnivList(soup): data = soup.find_all('tr') for tr in data: ltd = tr.find_all('td') if len(ltd)==0: continue singleUniv = [] for td in ltd: singleUniv.append(td.string) allUniv.append(singleUniv) def printUnivList(num): print("{:^4}{:^10}{:^5}{:^8}{:^10}".format("排名","学校名称","省市","总分","培养规模")) for i in range(num): u=allUniv[i] print("{:^4}{:^10}{:^5}{:^8}{:^10}".format(u[0],u[1],u[2],u[3],u[6])) def main(): url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html' html = getHTMLText(url) soup = BeautifulSoup(html, "html.parser") fillUnivList(soup) printUnivList(10) main()
^大学的排名
|
1、调试排球比赛程序:
# -*- coding: utf-8 -*- """ Created on Mon May 20 10:27:15 2019 @author: history """ # 比赛规则: # 1. 采用5局3胜制 # 2. 前四局采用25分制,每个队只有在赢得至少25分,且同时超过对方2分时才胜一局 # 3. 决胜局(第五局)采用15分制,先获得15分,且同时超过对方2分为胜 from random import random def printInfo(): ''' 打印程序的功能信息 ''' print("\t\t这个程序模拟2个队伍A和B的排球竞技比赛!") print("\t 程序运行需要队伍A和B的能力值(0到1之间的小数表示)") def getInputs(): ''' 获得用户输入的参数 ''' a = eval(input("请输入队伍A的能力值(0~1):")) b = eval(input("请输入队伍B的能力值(0~1):")) n = eval(input("请输入比赛次数:")) return a, b, n def NGames(n, probA, probB): ''' 模拟n场比赛 ''' winA, winB = 0, 0 for _ in range(n): scoreA, scoreB = OneGame(probA, probB) if scoreA > scoreB:#一场得分 winA += 1 else: winB += 1 return winA, winB #赢的场数 def OneGame(probA, probB): ''' 模拟一场比赛,包括五局 ''' scoreA, scoreB, N = 0, 0, 0 serving = 'A' while not GameOver(N, scoreA, scoreB): if serving == 'A': if random() > probA: scoreB += 1 serving = 'B' else: scoreA += 1 if serving == 'B': if random() > probB: scoreA += 1 serving = 'A' else: scoreB += 1 N += 1 return scoreA, scoreB def GameOver(N, a, b): ''' 定义赢得一局的条件 N: 当前局次(第五局为决胜局) ''' if N <= 4: return (a>=25 and b>=25 and abs(a-b)>=2) else: return (a>=15 and b>=15 and abs(a-b)>=2) def printResult(n, winA, winB): print("竞技分析开始,共模拟{}场比赛".format(n)) print("队伍A获胜{}场比赛,占比{:0.1%}".format(winA,winA/n)) print("队伍B获胜{}场比赛,占比{:0.1%}".format(winB,winB/n)) if __name__ == "__main__": printInfo() probA, probB, n = getInputs() winA, winB = NGames(n, probA, probB) printResult(n, winA, winB)
2、爬虫的小测试
1
import requests r=requests.get("http://www.baidu.com") '''type(r) < class'requests.models.Response'>''' print(r.status_code) print(r.text) print(r.encoding) r.encoding='utf-8' print(r.text)
你们能看的懂就行,反正我看不懂
3、打开Google网页20次:
先打开百度20次:
import requests def getHTMLText(url): try: r=requests.get(url,timeout=30) r.raise_for_status() r.encoding='utf-8' print(r.status_code) print(r.text) print(r.encoding) print(r.text) except: return "" for i in range(0,20): url="https://www.baidu.com" getHTMLText(url)
打开Google:
import requests def getHTMLText(url): try: r=requests.get(url,timeout=30) r.raise_for_status() r.encoding='utf-8' print(r.status_code) print(r.text) print(r.encoding) print(r.text) except: return "" for i in range(0,20): url="https://www.Google.com" getHTMLText(url)
毫无反应,可能是我网址错了,但不是我的代码问题,我能打开百度20次!!
import requests #from bs4 import BeautifulSoup #allUniv=[] def getHTMLText(url): try: r=requests.get(url,timeout=30) r.raise_for_status() r.encoding='utf-8' print(r) #print( r.raise_for_status) return r.text except: return "" url="http://www.zuihaodaxue.cn" getHTMLText(url)
4、 HTML的书写:
<!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title>菜鸟教程(runboo.com) 26 </title> </head> <body> <h1>我的第一个标题</h1> <p id="first">我的第一个段落。</p> </body> <table border="1"> <tr> <td>row 1,cell 1</td> <td>row 1,cell 2</td> </tr> <tr> <td>row 2,cell 1</td> <td>row 2,cell 2</td> </tr> </table> </html>
这是普通页面信息:
代码如下:
from bs4 import BeautifulSoup import re soup=BeautifulSoup("<html><head><title>Simpke</title></head><body><p id='china'>你好,中国</p></body></html>","html.parser") print(soup.head,"26") #打印head的内容和我的学号后两位 print(soup.body) #打印body的内容 print(soup.find_all(id="china")) #打印id为china的文本 r=soup.text pattern = re.findall(u'[\u1100-\uFFFDh]+?',r) print(pattern)
任务完成、