python123——爬虫作业

Posted on 2023-12-18 19:41  奇诺qwq  阅读(26)  评论(0)  编辑  收藏  举报

 

用requests库的get()函数访问百度主页20次

import requests

# Fetch the Baidu home page 20 times with requests.get() and report, for each
# visit, the HTTP status plus the size of the decoded text and of the raw bytes.
url = 'https://www.baidu.com'
for visit in range(1, 21):
    response = requests.get(url)
    # Force UTF-8 decoding so the Chinese text is not garbled.
    response.encoding = 'utf-8'
    page_text = response.text
    raw_bytes = response.content
    print(f"第{visit}次访问")
    print(f'Response status: {response.status_code}')
    print(f'Text content length: {len(page_text)}')
    print(f'Content length: {len(raw_bytes)}')
    print(page_text)

输出结果

 

创建html文件(文件名:test1.html,路径:D:\前端学习\其他练习\test1.html)

<!DOCTYPE html>
<html>

<head>
    <meta charset="utf-8">
    <title>菜鸟教程(runoob.com)</title>
</head>

<body>
    <h1>我的学号:11</h1>
    <h1>我的第一个标题</h1>
    <p id="first">我的第一个段落。</p>
    <!-- The table belongs inside <body>; content placed between </body> and </html> is invalid. -->
    <table border="1">
        <tr>
            <td>row 1, cell 1</td>
            <td>row 1, cell 2</td>
        </tr>
        <tr>
            <td>row 2, cell 1</td>
            <td>row 2, cell 2</td>
        </tr>
    </table>
</body>

</html>

代码

import re
from bs4 import BeautifulSoup
import requests
# Load the local HTML file and parse it with the built-in html.parser backend.
with open('D:\\前端学习\\其他练习\\test1.html', 'r', encoding='utf-8') as html_file:
    html_source = html_file.read()

demo = BeautifulSoup(html_source, 'html.parser')

# Look up the tag whose id attribute is "first".
first_tag = demo.find(id="first")

# Print the <title> tag, the <body> tag and the id="first" tag,
# each followed by an empty line.
for section in (demo.title, demo.body, first_tag):
    print(section)
    print("")

# Print every run of consecutive Chinese characters (U+4E00..U+9FA5)
# found in the document text, one run per line.
chinese_run = re.compile('[\u4e00-\u9fa5]+')
for text in chinese_run.findall(demo.get_text()):
    print(text)

输出结果

 

爬取2020年大学排名

import requests
from bs4 import BeautifulSoup
import bs4
import csv


def getHTMLText(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The decoded page text, or the string '获取网页失败' when the request
        fails (connection error, timeout, or a non-2xx HTTP status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        r.raise_for_status()
        # Use the encoding detected from the content rather than the
        # header default, so the text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "fetch failed";
        # programming errors and KeyboardInterrupt are allowed to propagate.
        return '获取网页失败'
     

def catchData(ulist, html):
    """Parse *html* and append one row per table line to *ulist*.

    Each appended row is a list holding the stripped text of the first five
    <td> cells of a <tr> found directly under the first <tbody>.

    Args:
        ulist: List that is extended in place with the extracted rows.
        html: HTML source to parse.

    If the document has no <tbody> (for example because the download failed
    and *html* is just an error message), *ulist* is left unchanged. Rows
    with fewer than five cells are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Nothing to extract; avoid AttributeError on None.children.
        return
    for tr in tbody.children:
        # .children also yields whitespace/text nodes; only real tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all('td')
        if len(tds) < 5:
            # Incomplete row: indexing the five expected cells would fail.
            continue
        ulist.append([td.text.strip() for td in tds[:5]])
     

def writeList(ulist, num, year):
    """Write the first *num* rows of *ulist* to '<year>年中国大学排名.csv'.

    Args:
        ulist: Sequence of rows; the first five items of each row are written.
        num: Maximum number of rows to write. If *ulist* holds fewer rows,
            all of them are written instead of raising IndexError.
        year: Year as a string; used in the file name and the status message.

    The file is created in the current working directory, encoded as GB2312,
    and starts with the header row 排名/名称/省份/类型/总分. Characters that
    GB2312 cannot encode raise UnicodeEncodeError.
    """
    # newline='' lets the csv module control line endings itself.
    with open(year + '年中国大学排名.csv', 'w', encoding='gb2312', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['排名', '名称', '省份', '类型', '总分'])
        # Slicing caps the output at num rows without indexing past the end.
        csv_writer.writerows(u[:5] for u in ulist[:num])
    print(year + '年中国大学排名爬取成功')
          
          
if __name__ == '__main__':
    # Download the ranking page for the chosen year, extract its table rows
    # and save the first 20 of them to a CSV file.
    year = 2020
    url = f'https://www.shanghairanking.cn/rankings/bcur/{year}11'
    uinfo = []
    catchData(uinfo, getHTMLText(url))
    writeList(uinfo, 20, str(year))

输出结果