A Simple Crawler
1. A Simple Crawler
Goal: write a crawler that fetches the homepages of common search engines (Baidu, Sogou, and so on).
Approach: use the Python libraries requests, BeautifulSoup4, and lxml.
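Before writing any crawler code it is worth checking that the three libraries are importable (they can be installed with pip as requests, beautifulsoup4 and lxml). A minimal sanity check, nothing more:

import requests
import bs4
from lxml import etree

# Print the versions of the three libraries this article relies on
print("requests:", requests.__version__)
print("beautifulsoup4:", bs4.__version__)
print("lxml.etree:", etree.__version__)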
Implementation:
First, we need to fetch the page. That is straightforward with requests.get():
# coding: utf-8
import requests

url = "http://www.baidu.com"
# Set a timeout so the request fails quickly instead of hanging on a slow network
r = requests.get(url, timeout=1)
# A status code of 200 means the page was reached successfully
print(r.status_code)
Run from cmd on Windows 10, the script prints the status code; a 200 confirms the connection succeeded.
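Note that timeout only limits how long requests waits: when the limit is exceeded, requests.get() raises an exception instead of returning a response. A minimal sketch of catching that case (the fetch_status helper is just an illustrative name):

import requests

def fetch_status(url, timeout=1):
    """Return the HTTP status code, or None if the request fails or times out."""
    try:
        r = requests.get(url, timeout=timeout)
        return r.status_code
    except requests.exceptions.RequestException as e:
        print("request failed:", e)
        return None

print(fetch_status("http://www.baidu.com"))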
The next script extends the idea to a list of search-engine homepages. The block inside the triple quotes is an earlier version that saves each page to disk and pulls out every href with lxml's xpath; the active loop below it fetches Baidu repeatedly and prints the page text through BeautifulSoup.

# encoding: utf-8
import requests
from lxml import html
from bs4 import BeautifulSoup
# Lesson learned: never give the script the same name as a library,
# otherwise Python picks up the script instead of the library when importing.

url = ["http://www.baidu.com", "http://www.google.com", "http://www.sogou.com",
       "http://www.bing.com", "http://www.so.com"]
name = ["baidu", "google", "sogou", "bing", "360"]

'''
for i in range(5):
    r = requests.get(url[i], timeout=10)
    r.encoding = 'utf-8'
    tree = html.fromstring(r.text)
    urls = []
    if r.status_code == 200:
        # create an html file on disk and save the page into it
        with open("D:\\{}.html".format(name[i]), 'w', encoding='utf-8') as f:
            f.write(r.text)
        print("This is {} times: Successful!".format(i))
        for i in tree.xpath("//@href"):
            urls.append(i)
        for i in range(len(urls)):
            print(urls[i])
    else:
        print("This is {} times: False!".format(i))
'''

for i in range(20):
    r = requests.get(url[0], timeout=1)
    print(r.status_code)
    r.encoding = 'utf-8'
    print("type of text: ", type(r.text))
    print("type of content: ", type(r.content))
    soup = BeautifulSoup(r.text, "html.parser")
    print(soup.get_text())
    #print(r.text)
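For comparison with the xpath("//@href") call in the commented-out lxml version, here is a minimal sketch that collects the links of all <a> tags with BeautifulSoup (the xpath version also picks up href attributes on other elements such as <link>; the extract_links helper is just an illustrative name):

import requests
from bs4 import BeautifulSoup

def extract_links(page_url, timeout=10):
    """Fetch a page and return the href of every <a> tag that has one."""
    r = requests.get(page_url, timeout=timeout)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")
    return [a['href'] for a in soup.find_all('a', href=True)]

for link in extract_links("http://www.baidu.com"):
    print(link)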
Processing an HTML document
a. Print the contents of the body tag
b. Get the contents of the body tag
c. Get the tag object whose id is "first"
d. Extract and print the Chinese characters in the HTML page
The example below parses a small page with BeautifulSoup; a sketch covering tasks a to d follows it.
from lxml import html
from bs4 import BeautifulSoup
html_doc = """ <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title>菜鸟教程(runoob.com)</title> </head> <body> <h1>我的第一个标题</h1> <p>我的第一个段落。</p> </body> <table border="1"> <tr> <td>row 1, cell 1</td> <td>row 1, cell 2</td> </tr> <tr> <td>row 2, cell 1</td> <td>row 2, cell 2</td> </tr> </table> </html> """ #使用 BeautifulSoup 解析网页,并得到一个 BeautifulSoup 的对象 soup = BeautifulSoup(html_doc) #输出网页源码 print(soup.prettify()) #输出网页中的文本信息 #print(soup.get_text()) text = soup.get_text() print("---------------") #print() #输出网页中文本信息的长度(行数) print(len(soup.contents))
A first real exercise
Task: crawl the Chinese university ranking at http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html.
Here we crawl the 2015 data. The script below fetches the ranking table, prints the top entries, exports them to a CSV file with pandas, and saves them to MongoDB:
# coding: utf-8
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pymongo import MongoClient

client = MongoClient()
db = client['UnivRanking']
collection = db['UnivRanking']

name = ["排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"]
allUniv = []

def save_to_mongo(result):
    # MongoDB stores documents (dicts), so map each scraped row onto the column names
    try:
        docs = [dict(zip(name, row)) for row in result]
        if collection.insert_many(docs):
            print('Saved to Mongo')
    except Exception:
        print("Error saving to Mongo")

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        return ""

def fillUnivList(soup):
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) == 0:
            continue
        singleUniv = []
        for td in ltd:
            singleUniv.append(td.string)
        allUniv.append(singleUniv)

def printUnivList(num):
    # chr(12288) is the full-width space, used so the Chinese columns line up
    print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^8}{6:{0}^8}{7:{0}^8}".format(
        chr(12288), "排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"))
    for i in range(num):
        u = allUniv[i]
        print("{1:{0}^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}{6:{0}^10}{7:{0}^10}".format(
            chr(12288), u[0], u[1], u[2], u[3], u[4], u[5], u[6]))
    test = pd.DataFrame(columns=name, data=allUniv)
    print(test)
    test.to_csv('testcsv.csv', encoding='utf-8')

def main(num):
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2015_0.html"
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(num)
    save_to_mongo(allUniv)

if __name__ == "__main__":
    main(100)
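Assuming the script has been run against a local MongoDB instance and has written testcsv.csv, a small sketch to check both outputs afterwards:

import pandas as pd
from pymongo import MongoClient

# Reload the CSV written by printUnivList and show the first rows
df = pd.read_csv('testcsv.csv', index_col=0, encoding='utf-8')
print(df.head())

# Count the documents inserted by save_to_mongo
client = MongoClient()
collection = client['UnivRanking']['UnivRanking']
print(collection.count_documents({}))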