Python Web Scraping
from requests import request

resp = request('get', 'http://www.baidu.com')
print resp.content
If the scraper hits IOError or repeated connection failures, try supplying a complete set of request headers, as in the sketch below.
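A minimal sketch of a request with filled-in headers, using urllib2 as the rest of these notes do; the header values and target URL are illustrative placeholders, not values from the original:

import urllib2

url = 'http://www.baidu.com'  # placeholder target
# Mimic a normal browser; a missing User-Agent is a common cause of refused requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer': url,
}
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
print response.read()[:200]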
Scraping Tieba images
import urllib2
import urllib
import re
import time


def gethtml(url):
    # Fetch the page and return its HTML as a string
    page = urllib2.Request(url)
    html = urllib2.urlopen(page)
    return html.read()


def imgget(html):
    # Match the src attribute of .jpg images on the page
    reg = r'src="(.+?\.jpg)" size='
    img = re.compile(reg)
    imglist = re.findall(img, html)
    for imgurl in imglist:
        # Use a millisecond timestamp as a unique filename
        x = long(time.time() * 1000)
        urllib.urlretrieve(imgurl, r'E:\MyPro\code\spidersTest\image\%s.jpg' % x)


for i in range(1, 10):
    time.sleep(1)  # throttle requests
    print 'start catching page %d' % i
    html = gethtml("https://tieba.baidu.com/p/4844779320?pn=%d" % i)
    imgget(html)
A response fetched this way can only be read() once; a second read() returns an empty string because the stream has been drained. Cache the result of the first read() and reuse it; the cached bytes can be wrapped in a file-like object to restore a pre-read state (see the sketch after the code below).
def getpage(self, pn):
    try:
        url = self.baseurl + self.lzonly + '&pn=' + str(pn)
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        # print response.read()  # uncommenting this would drain the stream and make the return empty
        return response.read()
    except urllib2.URLError, e:
        if hasattr(e, "reason"):
            print u'Connection error, reason: ' + str(e.reason)
        return None
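A minimal sketch of the read-once behaviour and the caching workaround; the URL is a placeholder, and StringIO is only needed when downstream code expects a file-like object:

import urllib2
from StringIO import StringIO

response = urllib2.urlopen('http://www.baidu.com')
html = response.read()       # the one and only read(): the socket is drained
print len(response.read())   # 0 -- a second read() returns an empty string

buf = StringIO(html)         # wrap the cached bytes to restore a pre-read state
print len(buf.read())        # full content again
buf.seek(0)                  # rewind; buf can now be read() once more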
Scraping a Tieba thread
# -*- coding: utf-8 -*-
__author__ = 'P00113'

import urllib2
import re


# Helper class for cleaning page tags
class Tool:
    # Remove <img> tags and runs of 7 spaces
    removeImg = re.compile('<img.*?>| {7}|')
    # Remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # Replace line-breaking tags with \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Replace table cells <td> with \t
    replaceTD = re.compile('<td>')
    # Replace paragraph openings with \n plus two spaces
    replacePara = re.compile('<p.*?>')
    # Replace single or double <br> with \n
    replaceBR = re.compile('<br><br>|<br>')
    # Strip any remaining tags
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() removes leading/trailing whitespace
        return x.strip()


class NZTB(object):
    def __init__(self, baseurl, lzonly):
        self.baseurl = baseurl
        self.lzonly = '?see_lz=' + str(lzonly)  # see_lz=1 shows only the original poster
        self.tool = Tool()

    def getpage(self, pn):
        try:
            url = self.baseurl + self.lzonly + '&pn=' + str(pn)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            return response.read()
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u'Connection error, reason: ' + str(e.reason)
            return None

    def gettitle(self):
        html = self.getpage(1)
        pat = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        res = re.search(pat, html)
        if res:
            print res.group(1)
            return res.group(1).strip()
        else:
            print '%s' % ('-' * 100)
            return None

    def getpn(self):
        html = self.getpage(1)
        pat = re.compile('<li class="l_reply_num.*?<span.*?>(.*?)</span>', re.S)
        res = re.search(pat, html)
        if res:
            print res.group(1)
            return res.group(1)
        else:
            print '*****'
            return None

    def getcontent(self, pn):
        html = self.getpage(pn)
        # Match each floor's post body
        pat = re.compile('<div id="post_content_.*?">(.*?)</div>', re.S)
        res = re.findall(pat, html)
        # Match each floor's number (楼 = "floor" in the page markup)
        f_pat = re.compile('<div class="post-tail-wrap"><span.*?>(\d*)楼', re.S)
        f_res = re.findall(f_pat, html)
        for val, f in zip(res, f_res):
            v = self.tool.replace(val)
            if v:
                print f, u'floor %s' % ('-' * 100)
                print v, '\n'
            else:
                continue


if __name__ == '__main__':
    baseurl = 'http://tieba.baidu.com/p/5058456989'
    a = NZTB(baseurl, 0)
    for i in range(1, 4):
        a.getcontent(i)
Connecting to MySQL from Python raises UnicodeEncodeError: 'latin-1' codec can't encode character
Adding the following few lines fixes it:
import MySQLdb

db_para = {'host': '10.10.12.171', 'port': 3306, 'user': 'root',
           'passwd': 'Hwroot@com', 'db': 'test'}
dbcon = MySQLdb.connect(**db_para)
cur = dbcon.cursor()
# Force the connection and session character set to utf8
dbcon.set_character_set('utf8')
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')
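The same fix can usually be collapsed into the connect() call by passing charset directly; this is a sketch worth verifying against your MySQLdb version, reusing the connection parameters above:

import MySQLdb

dbcon = MySQLdb.connect(host='10.10.12.171', port=3306, user='root',
                        passwd='Hwroot@com', db='test',
                        charset='utf8')  # in MySQLdb, setting charset also enables use_unicode
cur = dbcon.cursor()
cur.execute('SELECT 1')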