Python爬虫

# Minimal crawler example: fetch a page with `requests` and print the raw body.
from requests import request
resp = request('get', 'http://www.baidu.com')
print resp.content  # Python 2 print statement; `content` is the raw response bytes

Python 爬虫遇到 IOError 或连接失败等错误时,可尝试把请求的 headers 补充完整后重试。

 爬取贴吧图片

import urllib2
import urllib
import re
import time

def gethtml(url):
    """Fetch *url* and return the response body as a string.

    Python 2 / urllib2 code. Indentation in the original paste was lost
    (the body sat at column 0, an IndentationError); restored here.
    """
    page = urllib2.Request(url)
    html = urllib2.urlopen(page)
    return html.read()

def imgget(html):
reg = r'src="(.+?\.jpg)" size='
img = re.compile(reg)
imglist = re.findall(img, html)
# return imglist

for imgurl in imglist:
x = long(time.time()*1000)
urllib.urlretrieve(imgurl, r'E:\MyPro\code\spidersTest\image\%s.jpg' % x)
for i in range(1, 10):
time.sleep(1)
print 'start catch page %d' % i
html = gethtml("https://tieba.baidu.com/p/4844779320?pn=%d" % i)
    imgget(html)

 Python 爬取网页时,response 对象 read() 一次之后再次 read() 会出问题;应保存第一次 read() 的结果复用,或将状态恢复至 read 之前。

    def getpage(self, pn):
        """Fetch page *pn* of the thread; return raw HTML, or None on URL error.

        NOTE: per the surrounding note, a urlopen response can only be
        read() once -- reading it here and again later returns nothing,
        so the result of read() should be kept, not re-read.
        """
        try:
            url = self.baseurl+self.lzonly+'&pn='+pn.__str__()
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            # print response.read()  # uncommenting this would consume the stream
            return response.read()
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u'连接错误,原因:'+e.reason
                return None

 爬取贴吧帖子

# -*- coding: utf-8 -*-
__author__ = 'P00113'

import urllib2
import urllib
import re
import time


# HTML tag cleanup for scraped Tieba post bodies.
class Tool:
    """Strips or normalizes HTML markup from scraped post content."""
    # Drop <img> tags and runs of 7 spaces.
    # (The original pattern ended with a stray '|', an empty alternative
    # that matched the empty string at every position; removed -- the
    # substitution result is unchanged since the replacement is "".)
    removeImg = re.compile('<img.*?>| {7}')
    # Drop anchor open/close tags, keeping the link text.
    removeAddr = re.compile('<a.*?>|</a>')
    # Turn row / div / paragraph-end tags into newlines.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Turn table cells into tabs.
    replaceTD = re.compile('<td>')
    # Paragraph start -> newline plus a 4-space indent.
    replacePara = re.compile('<p.*?>')
    # Single or double <br> -> newline.
    replaceBR = re.compile('<br><br>|<br>')
    # Anything that still looks like a tag is dropped last.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with every substitution above applied, then stripped."""
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n    ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() removes the leading/trailing whitespace the rules introduce
        return x.strip()


class NZTB(object):
    def __init__(self, baseurl, lzonly):
        self.baseurl = baseurl
        self.lzonly = '?see_lz=' + lzonly.__str__()
        self.tool = Tool()
    def getpage(self, pn):
        try:
            url = self.baseurl + self.lzonly + '&pn=' + pn.__str__()
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            # print response.read()
            return response.read()
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u'连接错误,原因:' + e.reason
                return None

    def gettitle(self):
        html = self.getpage(1)
        # html = '''%s''' % html
        # reg = r'<h3 class="core_title_txt.*?">(.*?)</h3>'
        pat = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        res = re.search(pat, html)
        if res:
            # print '%s' % ('-'*100)
            print res.group(1)
            print res.group(1)
            return res.group(1).strip()
        else:
            print '%s' % ('-' * 100)
            return None

    def getpn(self):
        html = self.getpage(1)
        pat = re.compile('<li class="l_reply_num.*?<span.*?>(.*?)</span>', re.S)
        res = re.search(pat, html)
        if res:
            print res.group(1)
            return res.group(1)
        else:
            print '*****'
            return None

    def getcontent(self, pn):
        html = self.getpage(pn)
        pat = re.compile('<div id="post_content_.*?">(.*?)</div>', re.S)  # 匹配楼层正文
        res = re.findall(pat, html)
        # f_pat = re.compile('<div class="post-tail-wrap"><span.*?</span><span.*?</span><span.*?>(.*?)</span>', re.S)
        f_pat = re.compile('<div class="post-tail-wrap"><span.*?>(\d*)楼', re.S)  # 匹配楼层
        f_res = re.findall(f_pat, html)
        # for v in f_res:
        #     print v
        for val, f in zip(res, f_res):
            # print val
            v = self.tool.replace(val)
            if v:
                print f, u"楼%s" % ('-' * 100)
                print v, '\n'
                # floor += 1
            else:
                continue


if __name__ == '__main__':
    # Crawl the first three pages of this thread (all posters, not lz-only).
    thread_url = 'http://tieba.baidu.com/p/5058456989'
    spider = NZTB(thread_url, 0)
    for page_no in [1, 2, 3]:
        spider.getcontent(page_no)

 Python连接数据库时出现  UnicodeEncodeError: 'latin-1' codec can't encode character

加入如下几行代码即可解决:

import MySQLdb


# Connection parameters for the test database.
db_para = {'host': '10.10.12.171',
           'port': 3306,
           'user': 'root',
           'passwd': 'Hwroot@com',
           'db': 'test'}
dbcon = MySQLdb.connect(**db_para)
cur = dbcon.cursor()
# Force the client, connection, and result charsets to UTF-8, which fixes
# "UnicodeEncodeError: 'latin-1' codec can't encode character" when the
# data contains non-latin-1 text.
dbcon.set_character_set('utf8')
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')

 

posted @ 2017-04-13 11:41  好奇的小明  阅读(245)  评论(0编辑  收藏  举报