Prerequisite techniques for Python web crawlers
1. Using BeautifulSoup to parse HTML
Adapted from: http://blog.csdn.net/u013372487/article/details/51734047

#!/usr/bin/python
# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup
import re

# The string to parse
html_doc = """
<html>
<head>
    <title>The Dormouse's story</title>
</head>
<body>
<p class="title aq">
    <b>
        The Dormouse's story
    </b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>

<p class="story">...</p>
"""

# Build a BeautifulSoup object from the HTML string
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the first title tag
print(soup.title)

# Print the name of the first title tag
print(soup.title.name)

# Print the text contained in the first title tag
print(soup.title.string)

# Print the name of the parent tag of the first title tag
print(soup.title.parent.name)

# Print the first p tag
print(soup.p)

# Print the class attribute of the first p tag
print(soup.p['class'])

# Print the href attribute of the first a tag
print(soup.a['href'])

'''
Tag attributes can be added, deleted or modified.
Once again, attributes are handled just like a dictionary.
'''
# Change the href attribute of the first a tag to http://www.baidu.com/
soup.a['href'] = 'http://www.baidu.com/'

# Add a name attribute to the first a tag
soup.a['name'] = u'百度'

# Delete the class attribute of the first a tag
del soup.a['class']

# Print all child nodes of the first p tag
print(soup.p.contents)

# Print the first a tag
print(soup.a)

# Print all a tags, as a list
print(soup.find_all('a'))

# Print the first a tag whose id attribute equals link3
print(soup.find(id="link3"))

# Get all of the text content
print(soup.get_text())

# Print all attributes of the first a tag
print(soup.a.attrs)

for link in soup.find_all('a'):
    # Get the href attribute of each link
    print(link.get('href'))

# Loop over the children of soup.p
for child in soup.p.children:
    print(child)

# Regex match: tags whose names contain "b"
for tag in soup.find_all(re.compile("b")):
    print(tag.name)

2. Using cookies, and function-based crawlers
See: https://cuiqingcai.com/968.html

3. Headers, proxies, timeouts, authentication, and exception handling
See: http://blog.csdn.net/m_buddy/article/details/55193762
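Sections 2 and 3 only link out, so here is a minimal standard-library sketch (not taken from the linked articles) combining the two topics: a cookie-aware opener, plus a request with a custom header, a proxy, and a timeout. The target URL and the proxy address are placeholders; basic authentication would be one more handler (e.g. urllib.request.HTTPBasicAuthHandler) passed to build_opener.

# -*- coding: UTF-8 -*-
import http.cookiejar
import urllib.error
import urllib.request

# Section 2: a CookieJar-backed opener stores cookies from responses and
# resends them on later requests automatically
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar),
    # Section 3: route traffic through a proxy; the address is a
    # placeholder, drop this handler to connect directly
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
)

# Section 3: attach a custom User-Agent header to the request
request = urllib.request.Request(
    'http://www.baidu.com/',
    headers={'User-Agent': 'Mozilla/5.0'}
)

try:
    # Section 3: give up if the server does not answer within 10 seconds
    response = opener.open(request, timeout=10)
    print(response.getcode())
    for cookie in cookie_jar:
        print(cookie.name, cookie.value)
except urllib.error.URLError as e:
    print(e.reason)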
4. Error and exception handling

1. URLError

# -*- coding: UTF-8 -*-
import urllib.error
import urllib.request

if __name__ == "__main__":
    # A link that does not exist
    url = "http://www.douyu.com/Jack_Cui.html"
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
        # html = response.read()
    except urllib.error.HTTPError as e:
        print(e.code)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
403
Process finished with exit code 0

The same request, this time reading and printing the page body on success:

# -*- coding: UTF-8 -*-
import urllib.error
import urllib.request

if __name__ == "__main__":
    # A link that does not exist
    url = "http://www.douyu.com/Jack_Cui.html"
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.HTTPError as e:
        print(e.code)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
403
Process finished with exit code 0

Catching the broader URLError and using hasattr to tell the cases apart (HTTPError is a subclass of URLError and carries both code and reason, so both branches fire here):

import urllib.error
import urllib.request

url = "http://www.douyu.com/Jack_Cui.html"
rep = urllib.request.Request(url)
try:
    data = urllib.request.urlopen(rep)
except urllib.error.URLError as e:
    if hasattr(e, 'code'):
        print("HTTPError")
        print(e.code)
    if hasattr(e, 'reason'):
        print("URLError")
        print(e.reason)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
HTTPError
403
URLError
Forbidden
Process finished with exit code 0

5. Printing with and without newlines in Python
See: https://www.cnblogs.com/kfx2007/p/5970784.html

Example:

# coding=utf-8
import re

language = '''
<table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">
<caption style="text-align: center; font-size: larger;" class="fn"><b>jenkins</b></caption>
<tr>
<th>性別:</th>
<td>男</td>
</tr>
<tr>
<th>異名:</th>
<td><span class="nickname">(字) 翔宇</span></td>
</tr>
<tr>
<th>爱好:</th>
<td><span class="org"><a href="../articles/%E4%B8%AD9A.html" title="篮球">篮球</a></span></td>
</tr>
<tr>
<th>籍貫:</th>
<td><a href="../articles/%E6%B5%9981.html" title="广西省">广西省</a><a href="../articles/%E7%BB%8D82.html" title="桂林市">桂林市</a></td>
</tr>
</table>
'''

# Extract the <tr> blocks from the table
res_tr = r'<tr>(.*?)</tr>'
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    # First column: the <th> label
    res_th = r'<th>(.*?)</th>'
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        if "href" in mm:
            # If the <th> contains a hyperlink, strip the <a> tag first
            restr = r'<a href=.*?>(.*?)</a>'
            h = re.findall(restr, mm, re.S | re.M)
            print(h[0], end=' ')    # end=' ' keeps values on the same line
        else:
            print(mm, end=' ')

    # Second column: the <td> value
    res_td = r'<td>(.*?)</td>'    # or r'<td .*?>(.*?)</td>'
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        if "href" in nn:
            # Handle hyperlinks: <a href=.. rel=..></a>
            res_value = r'<a .*?>(.*?)</a>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        elif "span" in nn:
            # Handle <span> tags, e.g. <td><span class="nickname">(字) 翔宇</span></td>
            res_value = r'<span .*?>(.*?)</span>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        else:
            print(nn, end=' ')
    print(' ')    # end of row: emit a newline

C:\Python34\python.exe G:/xiaoshuo2.py
性別: 男
異名: (字) 翔宇
爱好: 篮球
籍貫: 广西省 桂林市
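Section 5 (and section 6 below) revolve around one Python 3 feature: print() takes an end parameter that defaults to '\n'. A minimal self-contained demo:

# By default, print() appends a newline after its arguments
for word in ('a', 'b', 'c'):
    print(word)              # three separate lines

# end=' ' keeps the output on one line, as in the table example above
for word in ('a', 'b', 'c'):
    print(word, end=' ')     # prints: a b c
print()                      # finally emit the newline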
6. How to print without a newline in Python
See: https://www.cnblogs.com/hwd9654/p/5707920.html

The Baidu Tieba crawler below uses end='' when printing each floor:

# -*- coding: utf-8 -*-
import re
import urllib.error
import urllib.request

class Tool:
    removeImg = re.compile('<img.*?>| {7}|')             # strip <img> tags and 7-space runs
    removeAddr = re.compile('<a.*?>|</a>')               # strip <a> tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')   # tags replaced by a newline
    replaceTD = re.compile('<td>')                       # <td> replaced by a tab
    replacePara = re.compile('<p.*?>')                   # paragraph tags replaced by a newline
    replaceBR = re.compile('<br><br>|<br>')              # line breaks replaced by a newline
    removeExtraTag = re.compile('<.*?>')                 # strip any remaining tags

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()

class BDTB:
    def __init__(self, baseUrl, seeLZ):
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request).read().decode("utf8")
            # print(response)
            return response
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print("Failed to connect to Baidu Tieba, reason:", e.reason)
                return None

    def getTitle(self):
        page = self.getPage(1)
        pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            # print(result.group(1))
            return result.group(1).strip()
        else:
            return None

    def getPageNum(self):
        page = self.getPage(1)
        pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        # print(result.group(1))
        if result:
            return result.group(1).strip()
        else:
            return None

    def getContent(self, page):
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        floor = 1
        for item in items:
            print(floor, "楼-------------------------------------------------------------------------------------\n", end='')
            print(self.tool.replace(item))
            floor += 1

baseURLh = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURLh, 1)
bdtb.getContent(bdtb.getPage(1))

Output:
C:\Python34\python.exe C:/Users/Administrator/ceshi.py
1 楼-------------------------------------------------------------------------------------
很多媒体都在每赛季之前给球员排个名,我也有这个癖好…………,我会尽量理性的分析球队地位,个人能力等因素,评出我心目中的下赛季50大现役球员,这个50大是指预估他本赛季在篮球场上对球队的影响力……不是过去的荣誉什么的,所以难免有一定的主观性……如果把你喜欢的球星排低了,欢迎理性讨论!
状元维金斯镇楼
P.S 1 我每天都至少更新一个,不TJ。
2 今年的新秀我就不考虑了,没上赛季参照
2 楼-------------------------------------------------------------------------------------
50 惊喜新人王 迈卡威
上赛季数据 篮板 6.2 助攻 6.3 抢断 1.9 盖帽 0.6 失误 3.5 犯规 3 得分 16.7
新赛季第50位,我给上赛季的新人王迈卡威。
上赛季迈卡威在彻底重建的76人中迅速掌握了球队,一开始就三双搞定了热火赢得了万千眼球。后来也屡屡有经验的表现,新秀赛季就拿过三双的球员不多,迈卡威现在可以说在76人站稳了脚跟。

7. XPath syntax for Python crawlers
See: http://www.cnblogs.com/lonenysky/p/4649455.html

//*[@id="AD_4586850"]/div[1]/strong/i
//*[@id="shop_list"]/div[1]/strong/i
//*[@id="shop_list"]
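The expressions above are only strings; here is a minimal sketch of evaluating the last two with lxml, run against a made-up fragment shaped the way those expressions expect (the shop names are hypothetical):

# -*- coding: utf-8 -*-
from lxml import etree

# Hypothetical HTML shaped like the expressions above expect
html = '''
<div id="shop_list">
    <div><strong><i>shop one</i></strong></div>
    <div><strong><i>shop two</i></strong></div>
</div>
'''
tree = etree.HTML(html)

# //*[@id="shop_list"]/div[1]/strong/i : text of the first matching <i>
print(tree.xpath('//*[@id="shop_list"]/div[1]/strong/i/text()'))  # ['shop one']

# //*[@id="shop_list"] : the element itself
print(tree.xpath('//*[@id="shop_list"]')[0].tag)  # div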
8. Using requests
See: http://cuiqingcai.com/2556.html

# -*- coding: utf-8 -*-
import requests

# headers is a dict carrying the User-Agent for the request
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

html = requests.get('http://cuiqingcai.com', headers=headers)
print(html.text)

9. Using re.sub
See: http://blog.csdn.net/lovemianmian/article/details/8867613

1. Removing <img> tags

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('<img.*?>')
s = re.sub(removeImg, "", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py
he is cool, clever, and so on...

1.1 Removing only 7-space runs

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('| {7}|')
s = re.sub(removeImg, "", text).strip()
print(s)

Output (the text contains no 7-space run, so nothing changes):
C:\Python34\python.exe G:/xiaoshuo2.py
<imgJGood is a handsome boy,> he is cool, clever, and so on...

2. Removing <img> tags and 7-space runs

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('<img.*?>| {7}|')
s = re.sub(removeImg, "", text).strip()
print(s)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
he is cool, clever, and so on...
Process finished with exit code 0

3. Removing <img> tags while keeping 7-space runs

import re

text = '<imgJGood is a handsome boy,> he is cool, clever, and so on...'
removeImg = re.compile('<img.*?>{7}')
s = re.sub(removeImg, "", text).strip()
print(s)

Output (here {7} quantifies the preceding >, so the pattern matches nothing and the text is unchanged):
C:\Python34\python.exe G:/xiaoshuo2.py
<imgJGood is a handsome boy,> he is cool, clever, and so on...
Process finished with exit code 0

4. Stripping the tags around content (the <a> tags are removed, the text between them is kept)

import re

text = '<a href="http://jump2.bdimg.com/safecheck/index?url=x+Z5)">迈卡威</a>刷出了不错的数据'
removeImg = re.compile('<a.*?>|</a>')
s = re.sub(removeImg, "", text).strip()
print(s)

Output:
C:\Python34\python.exe G:/xiaoshuo2.py
迈卡威刷出了不错的数据

5. Replacing <br> line breaks with \n newlines

import re

text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个,不TJ。<br> 2 今年的新秀我就不考虑了,没上赛季参照'
removeImg = re.compile('<br><br>|<br>')
s = re.sub(removeImg, "\n", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py
height="510">

状元维金斯镇楼
P.S 1 我每天都至少更新一个,不TJ。
 2 今年的新秀我就不考虑了,没上赛季参照

5.1 Replacing each single <br> with \n (each of the four leading <br> tags now yields its own newline, so more blank lines appear than in example 5)

import re

text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个,不TJ。<br> 2 今年的新秀我就不考虑了,没上赛季参照'
removeImg = re.compile('<br>')
s = re.sub(removeImg, "\n", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py
height="510">



状元维金斯镇楼
P.S 1 我每天都至少更新一个,不TJ。
 2 今年的新秀我就不考虑了,没上赛季参照

10. Regular expressions

Sample HTML (a Taobao model list item):

<div class="list-item">
    <div class="personal-info">
        <div class="pic-word">
            <div class="pic s60">
                <a href="//mm.taobao.com/687471686.htm" target="_blank" class="lady-avatar">
                    <img src="//gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg" alt="" width="60" height="60"/>
                </a>
            </div>
            <p class="top">
                <a class="lady-name" href="//mm.taobao.com/self/model_card.htm?user_id=687471686" target="_blank">jenkins</a>
                <em><strong>27</strong>岁</em>
                <span>广州市</span>

A pattern that captures the profile link, avatar image, name, age and city:

pattern = re.compile('<div class="list-item">.*? href="(.*?)".*? src="(.*?)".*? target="_blank">(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>', re.S)

or:

pattern = re.compile('<div class="list-item">.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name".*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>', re.S)
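A minimal sketch that runs the first pattern against the fragment above (abridged here to the parts the pattern actually consumes); re.S lets .*? span line breaks:

# -*- coding: utf-8 -*-
import re

# The list-item fragment from section 10, abridged
html = '''<div class="list-item">
<div class="pic s60">
<a href="//mm.taobao.com/687471686.htm" target="_blank" class="lady-avatar">
<img src="//gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg" alt="" width="60" height="60"/>
</a>
</div>
<a class="lady-name" href="//mm.taobao.com/self/model_card.htm?user_id=687471686" target="_blank">jenkins</a>
<em><strong>27</strong>岁</em>
<span>广州市</span>'''

pattern = re.compile('<div class="list-item">.*? href="(.*?)".*? src="(.*?)"'
                     '.*? target="_blank">(.*?)</a>.*?<strong>(.*?)</strong>'
                     '.*?<span>(.*?)</span>', re.S)

# findall returns one 5-tuple per list item
for link, avatar, name, age, city in re.findall(pattern, html):
    print(link, avatar, name, age, city)

# Expected output:
# //mm.taobao.com/687471686.htm //gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg jenkins 27 广州市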