# Tencent recruitment crawler (腾讯招聘爬取)

import requests
from bs4 import BeautifulSoup
import datetime
import re
import pymysql
import datetime
#数据库封装
class Mydb():
    """Thin wrapper around a pymysql connection for single-statement writes.

    Connection parameters are hard-coded for the local dev database
    (host 127.0.0.1, schema ``py11``).
    """

    def __init__(self):
        try:
            # Keyword arguments: positional connect() parameters were
            # deprecated and then removed in pymysql 1.0, so the keyword
            # form works on both old and new pymysql versions.
            self.conn = pymysql.connect(
                host='127.0.0.1',
                user='root',
                password='123456',
                database='py11',
                charset='utf8',
            )
            self.cursor = self.conn.cursor()
        except Exception as e:
            # Best-effort: log and continue; callers will fail later if
            # the connection never came up.
            print(e)

    def execute(self, sql, data):
        """Run one parameterized statement and commit; roll back on failure.

        :param sql:  SQL text with %s placeholders (pymysql paramstyle)
        :param data: sequence of values bound to the placeholders
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print(e)

# List-page URL template; %d is the 0-based record offset (the site pages
# 10 rows at a time, so valid offsets are 0, 10, 20, ...).
base_url = 'http://hr.tencent.com/position.php?start=%d'

# Minimal desktop-browser User-Agent so the site serves normal HTML
# instead of blocking the default requests UA.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# 处理详情页
def parse_detail(url):
    """Fetch one job-detail page, extract its fields and upsert them.

    Relies on the module-level ``mydb`` handle created under ``__main__``
    and on ``headers`` defined at module level.

    :param url: absolute URL of a position_detail page (must carry ``id=``)
    """
    response = requests.get(url, headers=headers)
    html = BeautifulSoup(response.text, 'lxml')

    # Job title: the heading row is the first <tr class="h">.
    position_name = html.select('tr[class="h"]')[0].text.strip()

    # Second row of the info table holds location / category / headcount.
    info = html.select('table.tablelist tr')
    location = info[1].select('td')[0].contents[-1]
    p_type = info[1].select('td')[1].contents[-1]
    # Strip the trailing "人" (people) unit from the headcount cell.
    p_number = info[1].select('td')[2].contents[-1].strip('人')

    # Job duties: every <li> of the third row, concatenated.
    duty = ''.join(li.text for li in info[2].select('li'))

    # Job requirements: every <li> of the fourth row, concatenated.
    requirement = ''.join(li.text for li in info[3].select('li'))

    # Crawl timestamp (local time) for the DB row.
    crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Numeric id from the final (post-redirect) URL. Guard against a
    # missing match instead of crashing on .group(1) of None.
    res = re.search(r'id=(\d+)', response.url)
    if res is None:
        print('no id found in url, skipping:', response.url)
        return
    url_id = res.group(1)

    # Upsert keyed on url_id; placeholders keep the query parameterized.
    sql = 'insert into ceshi(url_id,position_name,location,p_type,p_number,duty,requirement,crawl_time) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) ' \
          'on duplicate key update position_name=values(position_name)'
    data = [url_id, position_name, location, p_type, p_number, duty,
            requirement, crawl_time]

    print(position_name)
    mydb.execute(sql, data)

def getPage(max_start=2920, step=10):
    """Walk the paginated list and crawl every job-detail link found.

    :param max_start: largest record offset to request (inclusive);
                      defaults to the original hard-coded bound of 2920
    :param step:      rows per list page (the site serves 10)
    """
    for start in range(0, max_start + 1, step):
        response = requests.get(base_url % start, headers=headers)

        # Parse the list page and pull out the detail-page links.
        html = BeautifulSoup(response.text, 'lxml')
        # Skip the header row ([0]) and the pager row ([-1]).
        for tr in html.select('table.tablelist tr')[1:-1]:
            detail_link = tr.select('td > a')[0].get('href')
            # hrefs are site-relative (e.g. "position_detail.php?id=...").
            parse_detail('http://hr.tencent.com/' + detail_link)

if __name__ == '__main__':
    # Module-level DB handle — parse_detail() reads this global.
    mydb = Mydb()
    getPage()

# posted on 2018-08-19 21:46  luwanhe  (blog footer retained as provenance)