爬虫入门
1、搜索结果爬取(未解析)
#coding:utf-8
# Demo 1: submit a keyword search to Baidu and report the HTTP status and the
# size of the raw (unparsed) result page.
import requests

url = "http://www.baidu.com/s"
# The search keyword is sent as the 'wd' query-string parameter.
kv = {'wd': 'Python'}

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, params=kv, timeout=30)
    print(r.status_code)
    # Turn 4xx/5xx responses into an exception handled below.
    r.raise_for_status()
    # Decode using the encoding guessed from the body, not only the headers.
    r.encoding = r.apparent_encoding
    print(len(r.text))
except requests.RequestException:
    # Only network/HTTP failures are reported here; programming errors and
    # Ctrl-C are no longer swallowed.
    print('产生异常')
2、爬取图片
#coding:utf-8
# Demo 2: download a single image and store it under a local directory,
# skipping the download when the target file already exists.
import os

import requests

# NOTE: the scheme separator must be "http://"; without the slashes the URL
# has no host and the request can never succeed.
url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"
root = "/Users/wangkun/Desktop/DEMO1/"
# The local file name is the last path component of the URL.
path = root + url.split('/')[-1]

try:
    if not os.path.exists(root):
        # makedirs also creates any missing parent directories.
        os.makedirs(root)
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        # Do not save an HTTP error page as if it were the image.
        r.raise_for_status()
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print('文件保存成功')
    else:
        print('文件已存在')
except (requests.RequestException, IOError, OSError):
    # Network/HTTP failures and filesystem errors are reported; IOError and
    # OSError are both listed so this also works on Python 2.
    print('爬取失败')
3、IP归属地查询
#coding:utf-8
# Demo 3: query the ip138 service for a fixed IP address and print the tail
# of the returned page.
import requests

# The IP address to look up is appended to this query prefix.
url = "http://m.ip138.com/ip.asp?ip="

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url + '202.204.80.112', timeout=30)
    # Turn 4xx/5xx responses into an exception handled below.
    r.raise_for_status()
    # Decode using the encoding guessed from the body.
    r.encoding = r.apparent_encoding
    # Only the last 500 characters of the page are shown.
    print(r.text[-500:])
except requests.RequestException:
    # Only network/HTTP failures are reported here instead of every exception.
    print('爬取失败')
4、BeautifulSoup解析
# Demo 4: fetch a demo page and explore it with BeautifulSoup.
import requests
from bs4 import BeautifulSoup

r = requests.get('http://python123.io/ws/demo.html', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
demo = r.text
soup = BeautifulSoup(demo, 'html.parser')

# prettify() only returns the re-indented markup; print it so the call has a
# visible effect.
print(soup.prettify())

# List the target of every hyperlink on the page.
for link in soup.find_all('a'):
    print(link.get('href'))

# next_siblings is a generator; materialise it so the sibling nodes are shown
# rather than the generator object's repr. soup.a is None when the page has
# no <a> tag.
first_anchor = soup.a
if first_anchor is not None:
    print(list(first_anchor.next_siblings))
5、爬取并解析大学排名,print输出(数据结构结果)
#coding:utf-8
# Demo 5: download a university ranking page, extract three columns from its
# table body and print the first rows as a formatted table.
import requests
from bs4 import BeautifulSoup
import bs4
import re


def getHTMLText(url):
    """Download ``url`` and return the decoded page text.

    Returns an empty string when the request fails (connection error,
    timeout, or a 4xx/5xx status), so callers never see an exception from
    the network layer.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode using the encoding guessed from the body.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def fillList(ulist, html):
    """Append one ``[td0, td1, td3]`` entry to ``ulist`` per table row in ``html``.

    Only direct children of the first ``<tbody>`` that are tags are treated
    as rows. Each entry holds the ``.string`` of the row's first, second and
    fourth ``<td>`` cells; printList shows them under the rank / school name /
    total score headers. ``ulist`` is modified in place and nothing is
    returned.

    If ``html`` has no ``<tbody>`` (for example because the download failed
    and ``html`` is empty), ``ulist`` is left unchanged. Rows with fewer than
    four cells are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows; skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        if len(tds) < 4:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printList(ulist, num):
    """Print a header line followed by up to ``num`` entries of ``ulist``.

    Each entry is expected to be a sequence whose first three items are
    printed centred in tab-separated columns. When ``ulist`` holds fewer than
    ``num`` entries, only the available ones are printed.

    Returns the string ``"Suc"`` followed by the smaller of ``num`` and
    ``len(ulist)``.
    """
    row_format = "{:^10}\t{:^6}\t{:^10}"
    print(row_format.format('排名', '学校名称', '总分'))
    # Clamp to the available rows so a short list does not raise IndexError.
    count = min(num, len(ulist))
    # max(..., 0) keeps a negative count from turning into a "drop the tail"
    # slice; such a call prints no rows, as before.
    for u in ulist[:max(count, 0)]:
        print(row_format.format(u[0], u[1], u[2]))
    return "Suc" + str(count)


uinfo = []
url = "http://www.zuihaodaxue.cn/shengyuanzhiliangpaiming2018.html"
html = getHTMLText(url)
fillList(uinfo, html)
printList(uinfo, 20)  # show up to 20 universities