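# Scraper for job postings on a recruitment ("talent network") site. Written while learning Python -- guidance and criticism are welcome.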
import re
import sys
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# import json
# Request headers sent with every HTTP request made by this script.
# NOTE(review): the User-Agent value has no closing ")" and no ";" after
# "Windows NT 6.1". It is sent verbatim to the server, so it is left untouched
# here -- confirm with the site before normalising it.
headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}

# Site root; listing pages and posting links are resolved against it.
url = "http://www.lankao.ccoo.cn"

# Seconds to wait on a connection before giving up, so that a single stalled
# request cannot hang the whole crawl.
REQUEST_TIMEOUT = 15


def fetch_html(address, encoding="utf-8"):
    """Download *address* and return the response body decoded as text.

    Args:
        address: Absolute URL to request.
        encoding: Codec used to decode the raw response bytes.

    Returns:
        The decoded page source.

    Raises:
        OSError: On network failures (covers ``urllib.error.URLError`` and
            socket timeouts).
        UnicodeError: If the body cannot be decoded with *encoding*.
    """
    request = urllib.request.Request(address, headers=headers)
    # The context manager closes the connection even if read()/decode() fails.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode(encoding)


def first_text(soup, selector):
    """Return the text of the first element matching *selector*.

    Returns an empty string when nothing matches, so a posting that lacks one
    field is still printed instead of raising ``IndexError``.
    """
    matches = soup.select(selector)
    return matches[0].get_text() if matches else ""


def print_posting(posting_html):
    """Parse one posting page and print its title, time, salary and description."""
    posting = BeautifulSoup(posting_html, "lxml")
    name = first_text(posting, 'div div[class="zMain-titBox"] h2[class="tit"]')
    # Spaces are stripped from the salary text, as in the original script.
    price = first_text(posting, 'div div[id="baseInfo"] p').replace(" ", "")
    # Text of the first "tab" span; the original stored it in a variable named
    # `time`, which shadowed the stdlib module name.
    time_text = first_text(
        posting,
        'div div[class="infobox clearfix"] div[class="tabs1 fl"] span[class="tab"]',
    )
    # Join all description paragraphs in one pass instead of repeated `+=`.
    jbtext = "".join(p.text for p in posting.select('div[id="describe"] p'))

    print("标题:" + name)
    print(time_text)
    print("工资:" + price)
    print(jbtext)
    print("---------------------------------")


# The crawl runs at module top level, exactly as the original script did.
# Listing pages 1..9 are visited; each yields links to individual postings.
for page_number in range(1, 10):
    pageurl = url + "/post/zhaopin/pn" + str(page_number) + "/"
    print(pageurl)
    try:
        # Listing pages are decoded as GBK (as in the original).
        listing_html = fetch_html(pageurl, "gbk")
    except (OSError, UnicodeError) as exc:
        # One bad listing page should not abort the remaining pages.
        print("skipping listing page " + pageurl + ": " + str(exc), file=sys.stderr)
        continue

    listing = BeautifulSoup(listing_html, "lxml")
    for link in listing.select('li span a[class="title"]'):
        href = link.attrs.get("href")
        if not href:
            continue
        # urljoin gives the same result as plain concatenation for
        # root-relative hrefs and also copes with absolute or relative ones.
        myurl = urllib.parse.urljoin(url, href)
        try:
            # NOTE(review): posting pages are decoded as UTF-8 (the original
            # called .decode() with no argument) while listing pages use GBK --
            # confirm the site really serves the two in different encodings.
            posting_html = fetch_html(myurl)
        except (OSError, UnicodeError) as exc:
            print("skipping posting " + myurl + ": " + str(exc), file=sys.stderr)
            continue
        print_posting(posting_html)