知乎信息爬取(存在bug,望大牛指点)

import requests
from lxml import etree
import pymysql

class MysqlHelper(object):
    """Small wrapper around one pymysql connection and one cursor.

    Intended for data-modifying statements (INSERT/UPDATE/DELETE): each call
    to ``execute_modify_sql`` executes one parameterized statement and
    commits it.
    """

    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='123456', database='py11', charset='utf8'):
        """Open the connection and create a cursor.

        All parameters are passed straight to ``pymysql.connect``; the
        defaults reproduce the previously hard-coded local settings, so
        ``MysqlHelper()`` behaves as before.
        """
        # Assign both attributes first so close()/__del__ stay safe even if
        # connect() or cursor() raises part-way through.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host=host, port=port, user=user,
                                  password=password, database=database,
                                  charset=charset)
        self.cursor = self.db.cursor()

    def execute_modify_sql(self, sql, data):
        """Execute one parameterized statement and commit it.

        :param sql: statement text containing ``%s`` placeholders.
        :param data: parameter value(s) bound to the placeholders.
        :raises Exception: whatever ``execute``/``commit`` raised; the
            transaction is rolled back before the error is re-raised.
        """
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction on the shared
            # connection; undo it and let the caller see the failure.
            self.db.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        # getattr: the instance may never have been through __init__.
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Drop the references first so a second call is a no-op.
        self.cursor = None
        self.db = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if db is not None:
                db.close()

    def __del__(self):
        # Last-resort cleanup when the helper is garbage-collected.
        self.close()

if __name__ == '__main__':
    # Manual smoke test: insert one fixed row into the `zhihu` table.
    conn = MysqlHelper()
    # The trailing comma makes `data` a real 1-tuple; ('hehehe') is just a
    # parenthesised str.
    conn.execute_modify_sql('insert into zhihu(title) VALUE (%s)', data=('hehehe',))

# Crawl pages 1-20 of one Zhihu user's answer list and store each answer's
# title and text in the `zhihu` table.
base_url = 'https://www.zhihu.com/people/leedaye/answers?page=%s'
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
    "Referer": "http://www.zhihu.com/",
    'Host': 'www.zhihu.com',
}
myhelper = MysqlHelper()
sql = 'INSERT INTO zhihu(title,tet) VALUES' \
      ' (%s, %s)'


def _first_text(node, path):
    """Return the full text of the first element matching `path`, or None.

    Uses itertext() rather than `.text` so text inside nested tags is
    included, and returns None instead of raising IndexError when the
    XPath matches nothing.
    """
    matches = node.xpath(path)
    if not matches:
        return None
    return ''.join(matches[0].itertext())


for i in range(1, 21):
    url = base_url % i
    try:
        # A timeout keeps one stalled request from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('skip %s: %s' % (url, exc))
        continue
    html_ele = etree.HTML(response.text)
    if html_ele is None:
        # Nothing parseable came back for this page.
        continue
    # Iterate the children of the list container, one per answer. The
    # original selected the container itself, so only the first answer of
    # each page was ever read.
    # NOTE(review): these per-item paths are derived from the original
    # first-item paths ('./div[1]/div/...'); confirm against the live page
    # markup, which may also require login or JavaScript rendering.
    li_list = html_ele.xpath('//*[@id="Profile-answers"]/div[2]/div')
    for li_ele in li_list:
        title = _first_text(li_ele, './div/h2/div/a')
        if title is None:
            # Not an answer entry (or the markup changed); nothing to store.
            continue
        print(title)
        tet = _first_text(li_ele, './div/div[2]/div[1]/span')
        data = (title, tet)
        myhelper.execute_modify_sql(sql, data)

posted on 2018-08-19 22:04  luwanhe  阅读(270)  评论(0)  编辑  收藏  举报

导航