How to Fix Unreadable Chinese When Crawling Tieba with Python

In Python 3, urlopen(...).read() returns a bytes object, so a crawled Tieba page full of Chinese prints as \x.. escape sequences unless it is decoded first. The spider below decodes the response before printing and saving it.

import urllib.parse
import urllib.request


def tiebaSpider():
    kw = input("请输入搜索的贴吧:")
    beginPage = int(input("开始页数:"))
    endPage = int(input("结束页数:"))
    url = "http://tieba.baidu.com/f?ie=utf-8&"
    # urlencode percent-encodes the Chinese keyword so it can go into the URL
    key = urllib.parse.urlencode({"kw": kw})
    url = url + key
    spider(url, beginPage, endPage)
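For reference, urlencode turns the keyword into percent-escaped UTF-8 bytes; a quick interpreter check (the keyword "编程" here is just an example) looks like this:

>>> import urllib.parse
>>> urllib.parse.urlencode({"kw": "编程"})
'kw=%E7%BC%96%E7%A8%8B'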


def spider(url, beginPage, endPage):
    # +1 so the last page the user asked for is also fetched
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "第" + str(page) + "页.HTML"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)

        html = loadPage(fullurl, filename)
        # In Python 3 the response body comes back as bytes, not a readable
        # string; convert it with str(data[, encoding]) (or data.decode())
        html = str(html, 'utf-8')
        print(repr(html))
        writeFile(html, filename)


def loadPage(fullurl, filename):
    print("loading")
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(fullurl, headers=headers)
    # request.add_header("Accept-Encoding", "gzip, deflate")
    response = urllib.request.urlopen(request)
    # read() returns the raw response body as bytes
    return response.read()


def writeFile(html, filename):
    print("printing")
    # pass the encoding explicitly so Chinese text is written out correctly
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)


if __name__ == "__main__":
    tiebaSpider()
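The decisive step is str(html, 'utf-8'): without it the page content stays as bytes and the Chinese shows up as \x.. escape sequences. A minimal standalone sketch of the same idea (the byte string below is just the UTF-8 encoding of "贴吧", standing in for what response.read() would return):

data = "贴吧".encode("utf-8")    # bytes, like response.read() returns
print(data)                      # b'\xe8\xb4\xb4\xe5\x90\xa7' -- not readable
text = str(data, "utf-8")        # equivalent to data.decode("utf-8")
print(text)                      # 贴吧 -- readable again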
