A fairly complete, working crawler: it scrapes listing pages, loads the records into an Oracle database, removes duplicate rows, exports the data to a spreadsheet, and produces a JSON string of the results.
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import socket
import random
from retrying import retry
import xlwt
import cx_Oracle
import json
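
# Connect to Oracle and create the target table; on later runs the create
# fails because the table already exists, which is harmless.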
conn = cx_Oracle.connect('xxx/xxx')
cursor = conn.cursor()
try:
    cursor.execute('create table tb_user(url varchar2(250), name varchar2(250), '
                   'introduce varchar2(250), address varchar2(250))')
except cx_Oracle.DatabaseError:
    print "The table already exists, but please continue"
ippool = ['118.180.49.24:8080',
          '27.184.130.29:8888',
          '113.140.43.136:80',
          '60.169.19.66:9000',
          '60.21.206.165:9999']
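
# Retry a failed crawl a few times with a short pause; a bare @retry would
# retry forever on every exception.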
@retry(stop_max_attempt_number=3, wait_fixed=2000)
def crawl(url):
    rip = random.choice(ippool)
    print rip
    s = requests.Session()
    proxies = {
        'http': 'http://' + rip,
        'https': 'http://' + rip,
    }
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'}
    resp = s.get(url, proxies=proxies, timeout=15, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    my_title = soup.select('.des h2 a')
    f = open('F:\\yjh2\\xx.txt', 'a')
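    # Each listing title links to a detail page; fetch it through a fresh
    # random proxy and pull the contact, title, price and district fields.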
    for phone in my_title:
        url2 = phone['href']
        rip2 = random.choice(ippool)
        proxies = {
            'http': 'http://' + rip2,
            'https': 'http://' + rip2,
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
        resp2 = s.get(url2, proxies=proxies, headers=headers, timeout=15)
        resp2.encoding = 'utf-8'
        soup2 = BeautifulSoup(resp2.text, 'html.parser')
        my_dh = soup2.select('.phone-num')
        if len(my_dh) > 0:
            my_dh1 = my_dh[0].text
        else:
            my_dh1 = 'null'
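        # Store the listing only when a contact name is present; any other
        # missing field raises IndexError, which is handled below.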
        try:
            my_man_list = soup2.select('.c_000')
            if len(my_man_list) > 0:
                my_man = my_man_list[0].text
                my_bt = soup2.select('.c_333.f20')[0].text
                my_money = soup2.select('.c_ff552e')[0].text
                my_dq = soup2.select('.f14 span a')[1].text
                massage = url2 + ';' + my_man + ':' + my_dh1 + ';' + my_bt + my_money + ';' + my_dq
                param = {'id': url2, 'n': my_man + ':' + my_dh1, 'p': my_bt + my_money, 'm': my_dq}
                print massage
                cursor.execute('insert into tb_user values(:id, :n, :p, :m)', param)
                conn.commit()
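                # Purge duplicate urls, keeping only the row with the smallest
                # rowid (the first copy inserted).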
                cursor.execute('delete from tb_user '
                               'where url in (select url from tb_user '
                               'group by url having count(url) > 1) '
                               'and rowid not in (select min(rowid) from tb_user '
                               'group by url having count(url) > 1)')
                conn.commit()
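                # Rebuild the Excel sheet and the JSON list from the whole
                # (now deduplicated) table; doing this on every insert is
                # wasteful but keeps both outputs current at all times.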
                jsonData = []
                cursor.execute('select * from tb_user')
                i = 0
                wbk = xlwt.Workbook()
                sheet = wbk.add_sheet('foobar', cell_overwrite_ok=True)
                for row in cursor:
                    result = {}
                    result['url'] = row[0]
                    result['name'] = row[1]
                    result['jieshao'] = row[2]
                    result['diqu'] = row[3]
                    jsonData.append(result)
                    sheet.write(i, 0, row[0])
                    sheet.write(i, 1, row[1].decode('utf-8'))
                    sheet.write(i, 2, row[2].decode('utf-8'))
                    sheet.write(i, 3, row[3].decode('utf-8'))
                    i = i + 1
                wbk.save('58.xls')
                jsondatar = json.dumps(jsonData, ensure_ascii=False, indent=4)
                # jsonData can still be manipulated as a list; the serialized
                # string jsondatar cannot.
                f.write(massage.encode('utf-8') + '\n')
            else:
                print 'Empty!'
                continue
        except (IndexError, socket.error):
            print '!'
    f.close()
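
# Crawl listing pages pn1 through pn30 of the cc.58.com rental section.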
for page in range(1, 31):
    url = 'http://cc.58.com/chuzu/pn{}'.format(page)
    crawl(url)
print "Download finished"