Prerequisite techniques for Python web crawlers
1. Using BeautifulSoup to parse HTML
Adapted from: http://blog.csdn.net/u013372487/article/details/51734047

#!/usr/bin/python
# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup
import re

# The string to parse
html_doc = """
<html>
<head>
    <title>The Dormouse's story</title>
</head>
<body>
<p class="title aq">
    <b>
        The Dormouse's story
    </b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>

<p class="story">...</p>
"""

# Build a BeautifulSoup object from the HTML string
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the first title tag
print(soup.title)

# Print the name of the first title tag
print(soup.title.name)

# Print the text contained in the first title tag
print(soup.title.string)

# Print the name of the parent tag of the first title tag
print(soup.title.parent.name)

# Print the first p tag
print(soup.p)

# Print the class attribute of the first p tag
print(soup.p['class'])

# Print the href attribute of the first a tag
print(soup.a['href'])

'''
Tag attributes can be added, deleted or modified.
Once again, attributes are handled just like a dictionary.
'''
# Change the href attribute of the first a tag to http://www.baidu.com/
soup.a['href'] = 'http://www.baidu.com/'

# Add a name attribute to the first a tag
soup.a['name'] = u'百度'

# Delete the class attribute of the first a tag
del soup.a['class']

# Print all child nodes of the first p tag
print(soup.p.contents)

# Print the first a tag
print(soup.a)

# Print all a tags, as a list
print(soup.find_all('a'))

# Print the first a tag whose id attribute equals link3
print(soup.find(id="link3"))

# Get all of the text content
print(soup.get_text())

# Print all attributes of the first a tag
print(soup.a.attrs)

for link in soup.find_all('a'):
    # Get the href attribute of each link
    print(link.get('href'))

# Loop over the children of soup.p
for child in soup.p.children:
    print(child)

# Regex match: tags whose names contain "b"
for tag in soup.find_all(re.compile("b")):
    print(tag.name)

2. Using cookies, and function-based crawlers
See: https://cuiqingcai.com/968.html

3. Headers, proxies, timeouts, authentication, and exception handling
See: http://blog.csdn.net/m_buddy/article/details/55193762
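Sections 2 and 3 only link out, so here is a minimal standard-library sketch (not taken from the linked articles) combining the two topics: a cookie-aware opener, plus a request with a custom header, a proxy, and a timeout. The target URL and the proxy address are placeholders; basic authentication would be one more handler (e.g. urllib.request.HTTPBasicAuthHandler) passed to build_opener.

# -*- coding: UTF-8 -*-
import http.cookiejar
import urllib.error
import urllib.request

# Section 2: a CookieJar-backed opener stores cookies from responses and
# resends them on later requests automatically
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar),
    # Section 3: route traffic through a proxy; the address is a
    # placeholder, drop this handler to connect directly
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
)

# Section 3: attach a custom User-Agent header to the request
request = urllib.request.Request(
    'http://www.baidu.com/',
    headers={'User-Agent': 'Mozilla/5.0'}
)

try:
    # Section 3: give up if the server does not answer within 10 seconds
    response = opener.open(request, timeout=10)
    print(response.getcode())
    for cookie in cookie_jar:
        print(cookie.name, cookie.value)
except urllib.error.URLError as e:
    print(e.reason)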
4. Error and exception handling

1. URLError

# -*- coding: UTF-8 -*-
import urllib.error
import urllib.request

if __name__ == "__main__":
    # A link that does not exist
    url = "http://www.douyu.com/Jack_Cui.html"
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
        # html = response.read()
    except urllib.error.HTTPError as e:
        print(e.code)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
403
Process finished with exit code 0

The same request, this time reading and printing the page body on success:

# -*- coding: UTF-8 -*-
import urllib.error
import urllib.request

if __name__ == "__main__":
    # A link that does not exist
    url = "http://www.douyu.com/Jack_Cui.html"
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.HTTPError as e:
        print(e.code)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
403
Process finished with exit code 0

Catching the broader URLError and using hasattr to tell the cases apart (HTTPError is a subclass of URLError and carries both code and reason, so both branches fire here):

import urllib.error
import urllib.request

url = "http://www.douyu.com/Jack_Cui.html"
rep = urllib.request.Request(url)
try:
    data = urllib.request.urlopen(rep)
except urllib.error.URLError as e:
    if hasattr(e, 'code'):
        print("HTTPError")
        print(e.code)
    if hasattr(e, 'reason'):
        print("URLError")
        print(e.reason)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
HTTPError
403
URLError
Forbidden
Process finished with exit code 0

5. Printing with and without newlines in Python
See: https://www.cnblogs.com/kfx2007/p/5970784.html

Example:

# coding=utf-8
import re

language = '''
<table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">
<caption style="text-align: center; font-size: larger;" class="fn"><b>jenkins</b></caption>
<tr>
<th>性別:</th>
<td>男</td>
</tr>
<tr>
<th>異名:</th>
<td><span class="nickname">(字) 翔宇</span></td>
</tr>
<tr>
<th>爱好:</th>
<td><span class="org"><a href="../articles/%E4%B8%AD9A.html" title="篮球">篮球</a></span></td>
</tr>
<tr>
<th>籍貫:</th>
<td><a href="../articles/%E6%B5%9981.html" title="广西省">广西省</a><a href="../articles/%E7%BB%8D82.html" title="桂林市">桂林市</a></td>
</tr>
</table>
'''

# Extract the <tr> blocks from the table
res_tr = r'<tr>(.*?)</tr>'
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    # First column: the <th> label
    res_th = r'<th>(.*?)</th>'
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        if "href" in mm:
            # If the <th> contains a hyperlink, strip the <a> tag first
            restr = r'<a href=.*?>(.*?)</a>'
            h = re.findall(restr, mm, re.S | re.M)
            print(h[0], end=' ')    # end=' ' keeps values on the same line
        else:
            print(mm, end=' ')

    # Second column: the <td> value
    res_td = r'<td>(.*?)</td>'    # or r'<td .*?>(.*?)</td>'
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        if "href" in nn:
            # Handle hyperlinks: <a href=.. rel=..></a>
            res_value = r'<a .*?>(.*?)</a>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        elif "span" in nn:
            # Handle <span> tags, e.g. <td><span class="nickname">(字) 翔宇</span></td>
            res_value = r'<span .*?>(.*?)</span>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        else:
            print(nn, end=' ')
    print(' ')    # end of row: emit a newline

C:\Python34\python.exe G:/xiaoshuo2.py
性別: 男
異名: (字) 翔宇
爱好: 篮球
籍貫: 广西省 桂林市
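Section 5 (and section 6 below) revolve around one Python 3 feature: print() takes an end parameter that defaults to '\n'. A minimal self-contained demo:

# By default, print() appends a newline after its arguments
for word in ('a', 'b', 'c'):
    print(word)              # three separate lines

# end=' ' keeps the output on one line, as in the table example above
for word in ('a', 'b', 'c'):
    print(word, end=' ')     # prints: a b c
print()                      # finally emit the newline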
6. How to print without a newline in Python
See: https://www.cnblogs.com/hwd9654/p/5707920.html

The Baidu Tieba crawler below uses end='' when printing each floor:

# -*- coding: utf-8 -*-
import re
import urllib.error
import urllib.request

class Tool:
    removeImg = re.compile('<img.*?>| {7}|')             # strip <img> tags and 7-space runs
    removeAddr = re.compile('<a.*?>|</a>')               # strip <a> tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')   # tags replaced by a newline
    replaceTD = re.compile('<td>')                       # <td> replaced by a tab
    replacePara = re.compile('<p.*?>')                   # paragraph tags replaced by a newline
    replaceBR = re.compile('<br><br>|<br>')              # line breaks replaced by a newline
    removeExtraTag = re.compile('<.*?>')                 # strip any remaining tags

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()

class BDTB:
    def __init__(self, baseUrl, seeLZ):
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request).read().decode("utf8")
            # print(response)
            return response
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print("Failed to connect to Baidu Tieba, reason:", e.reason)
                return None

    def getTitle(self):
        page = self.getPage(1)
        pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            # print(result.group(1))
            return result.group(1).strip()
        else:
            return None

    def getPageNum(self):
        page = self.getPage(1)
        pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        # print(result.group(1))
        if result:
            return result.group(1).strip()
        else:
            return None

    def getContent(self, page):
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        floor = 1
        for item in items:
            print(floor, "楼-------------------------------------------------------------------------------------\n", end='')
            print(self.tool.replace(item))
            floor += 1

baseURLh = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURLh, 1)
bdtb.getContent(bdtb.getPage(1))

Output:
C:\Python34\python.exe C:/Users/Administrator/ceshi.py
1 楼-------------------------------------------------------------------------------------
很多媒体都在每赛季之前给球员排个名,我也有这个癖好…………,我会尽量理性的分析球队地位,个人能力等因素,评出我心目中的下赛季50大现役球员,这个50大是指预估他本赛季在篮球场上对球队的影响力……不是过去的荣誉什么的,所以难免有一定的主观性……如果把你喜欢的球星排低了,欢迎理性讨论!
状元维金斯镇楼
P.S 1 我每天都至少更新一个,不TJ。
2 今年的新秀我就不考虑了,没上赛季参照
2 楼-------------------------------------------------------------------------------------
50 惊喜新人王 迈卡威
上赛季数据 篮板 6.2 助攻 6.3 抢断 1.9 盖帽 0.6 失误 3.5 犯规 3 得分 16.7
新赛季第50位,我给上赛季的新人王迈卡威。
上赛季迈卡威在彻底重建的76人中迅速掌握了球队,一开始就三双搞定了热火赢得了万千眼球。后来也屡屡有经验的表现,新秀赛季就拿过三双的球员不多,迈卡威现在可以说在76人站稳了脚跟。

7. XPath syntax for Python crawlers
See: http://www.cnblogs.com/lonenysky/p/4649455.html

//*[@id="AD_4586850"]/div[1]/strong/i
//*[@id="shop_list"]/div[1]/strong/i
//*[@id="shop_list"]
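The expressions above are only strings; here is a minimal sketch of evaluating the last two with lxml, run against a made-up fragment shaped the way those expressions expect (the shop names are hypothetical):

# -*- coding: utf-8 -*-
from lxml import etree

# Hypothetical HTML shaped like the expressions above expect
html = '''
<div id="shop_list">
    <div><strong><i>shop one</i></strong></div>
    <div><strong><i>shop two</i></strong></div>
</div>
'''
tree = etree.HTML(html)

# //*[@id="shop_list"]/div[1]/strong/i : text of the first matching <i>
print(tree.xpath('//*[@id="shop_list"]/div[1]/strong/i/text()'))  # ['shop one']

# //*[@id="shop_list"] : the element itself
print(tree.xpath('//*[@id="shop_list"]')[0].tag)  # div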
8. Using requests
See: http://cuiqingcai.com/2556.html

# -*- coding: utf-8 -*-
import requests

# headers is a dict carrying the User-Agent for the request
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

html = requests.get('http://cuiqingcai.com', headers=headers)
print(html.text)

9. Using re.sub
See: http://blog.csdn.net/lovemianmian/article/details/8867613

1. Removing <img> tags

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('<img.*?>')
s = re.sub(removeImg, "", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py
he is cool, clever, and so on...

1.1 Removing only 7-space runs

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('| {7}|')
s = re.sub(removeImg, "", text).strip()
print(s)

Output (the text contains no 7-space run, so nothing changes):
C:\Python34\python.exe G:/xiaoshuo2.py
<imgJGood is a handsome boy,> he is cool, clever, and so on...

2. Removing <img> tags and 7-space runs

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('<img.*?>| {7}|')
s = re.sub(removeImg, "", text).strip()
print(s)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
he is cool, clever, and so on...
Process finished with exit code 0

3. Removing <img> tags while keeping 7-space runs

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('<img.*?>{7}')
s = re.sub(removeImg, "", text).strip()
print(s)

Output (here {7} quantifies the preceding >, so the pattern matches nothing and the text is unchanged):
C:\Python34\python.exe G:/xiaoshuo2.py
<imgJGood is a handsome boy,> he is cool, clever, and so on...
Process finished with exit code 0

4. Stripping the tags around content (the <a> tags are removed, the text between them is kept)

import re

text = '<a href="http://jump2.bdimg.com/safecheck/index?url=x+Z5)">迈卡威</a>刷出了不错的数据'
removeImg = re.compile('<a.*?>|</a>')
s = re.sub(removeImg, "", text).strip()
print(s)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
迈卡威刷出了不错的数据

5. Replacing <br> line breaks with \n newlines

import re

text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个,不TJ。<br> 2 今年的新秀我就不考虑了,没上赛季参照'
removeImg = re.compile('<br><br>|<br>')
s = re.sub(removeImg, "\n", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py
height="510">

状元维金斯镇楼
P.S 1 我每天都至少更新一个,不TJ。
 2 今年的新秀我就不考虑了,没上赛季参照

5.1 Replacing each single <br> with \n (each of the four leading <br> tags now yields its own newline, so more blank lines appear than in example 5)

import re

text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个,不TJ。<br> 2 今年的新秀我就不考虑了,没上赛季参照'
removeImg = re.compile('<br>')
s = re.sub(removeImg, "\n", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py
height="510">



状元维金斯镇楼
P.S 1 我每天都至少更新一个,不TJ。
 2 今年的新秀我就不考虑了,没上赛季参照

10. Regular expressions

Sample HTML (a Taobao model list item):

<div class="list-item">
    <div class="personal-info">
        <div class="pic-word">
            <div class="pic s60">
                <a href="//mm.taobao.com/687471686.htm" target="_blank" class="lady-avatar">
                    <img src="//gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg" alt="" width="60" height="60"/>
                </a>
            </div>
            <p class="top">
                <a class="lady-name" href="//mm.taobao.com/self/model_card.htm?user_id=687471686" target="_blank">jenkins</a>
                <em><strong>27</strong>岁</em>
                <span>广州市</span>

A pattern that captures the profile link, avatar image, name, age and city:

pattern = re.compile('<div class="list-item">.*? href="(.*?)".*? src="(.*?)".*? target="_blank">(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>', re.S)

or:

pattern = re.compile('<div class="list-item">.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name".*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>', re.S)
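A minimal sketch that runs the first pattern against the fragment above (abridged here to the parts the pattern actually consumes); re.S lets .*? span line breaks:

# -*- coding: utf-8 -*-
import re

# The list-item fragment from section 10, abridged
html = '''<div class="list-item">
<div class="pic s60">
<a href="//mm.taobao.com/687471686.htm" target="_blank" class="lady-avatar">
<img src="//gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg" alt="" width="60" height="60"/>
</a>
</div>
<a class="lady-name" href="//mm.taobao.com/self/model_card.htm?user_id=687471686" target="_blank">jenkins</a>
<em><strong>27</strong>岁</em>
<span>广州市</span>'''

pattern = re.compile('<div class="list-item">.*? href="(.*?)".*? src="(.*?)"'
                     '.*? target="_blank">(.*?)</a>.*?<strong>(.*?)</strong>'
                     '.*?<span>(.*?)</span>', re.S)

# findall returns one 5-tuple per list item
for link, avatar, name, age, city in re.findall(pattern, html):
    print(link, avatar, name, age, city)

# Expected output:
# //mm.taobao.com/687471686.htm //gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg jenkins 27 广州市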