Using multiple proxy IPs to crawl company information from an investment data site

The project information is scraped with requests + beautifulsoup4. The site is fairly involved: the data for a single record is spread across six different pages, so every record requires six page fetches before it can be assembled into that record's data dictionary. Some fields have multiple entries, and each entry may be subdivided further, so the resulting structure is deeply nested, up to five levels in the worst case. The site's anti-crawling measure is IP banning, and it is very aggressive: fetching roughly 30 pages in quick succession immediately gets the whole IP range blocked (an extremely blunt approach that collaterally blocks many legitimate visitors, whose normal browsing fails under such harsh blocking). The current workaround is to harvest a large number of free proxy IPs into a pool, validate them, and switch to a new IP after every 3 projects. The code is below.
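
The gist of the workaround looks like this. The sketch below only illustrates the proxy-rotation pattern; crawl_with_rotation, SWITCH_EVERY and project_ids are made-up names for the example and are not part of the real spider shown later:

import random
import requests

SWITCH_EVERY = 3   # switch to a fresh proxy every 3 projects (illustrative value)

def crawl_with_rotation(project_ids, ip_pool, headers):
    # Fetch each project page, rotating to a new proxy every SWITCH_EVERY requests
    proxy = {'http': random.choice(ip_pool)}
    results = []
    for n, pid in enumerate(project_ids):
        if n and n % SWITCH_EVERY == 0:
            proxy = {'http': random.choice(ip_pool)}
        url = 'http://www.innotree.cn/project_extra/detail/%d.html' % pid
        try:
            resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
            results.append((pid, resp.status_code))
        except requests.RequestException:
            results.append((pid, None))   # a dead proxy would be dropped from the pool here
    return results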

1. Configuration file (config1.py):

'''
config for the spiders
'''
TIMEOUT = 5                                        # seconds allowed when probing a proxy
RANGE = (1, 10000)                                 # range of project ids to crawl
OUTPUT_FILE_PATH = r'innotree_1-10000.txt'         # successfully crawled records, one JSON object per line
ERROR_FILE_PATH = r'innotree_missing_1-10000.txt'  # indexes that could not be fetched
THREADNUM = 30                                     # size of the gevent worker pool
TEST_URL = 'http://www.ip138.com/'                 # URL used to check whether a proxy is alive
'''
USER_AGENTS: pool of User-Agent strings, one is picked at random for each request
'''
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
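
These settings are consumed by the spider in the next section: each request gets a random User-Agent from USER_AGENTS, and TEST_URL together with TIMEOUT is used to probe whether a proxy is still alive. A minimal sketch of that probe (check_proxy is an illustrative name; the real implementation is valid_ip in section 2):

import random
import requests

def check_proxy(ip):
    # Return the proxy URL if it can fetch TEST_URL within TIMEOUT, otherwise None
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    try:
        r = requests.get(TEST_URL, headers=headers, proxies={'http': ip}, timeout=TIMEOUT)
        return ip if r.ok else None
    except requests.RequestException:
        return None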

2. Proxy handling and spider logic:

#coding:utf-8
from gevent import monkey
monkey.patch_all()  # patch the standard library before anything network-related is imported

import requests
import json
import time
from time import gmtime, strftime
from gevent.pool import Pool
import sqlite3
import random
from config1 import USER_AGENTS, THREADNUM, RANGE, OUTPUT_FILE_PATH, ERROR_FILE_PATH, TEST_URL, TIMEOUT
from parser import parse_intro, parse_main, parse_rank, parse_team, parse_financing, parse_customer


class Innotree_Spider(object):
    def __init__(self):
        self.crawl_pool = Pool(THREADNUM)
        #self.ip_pool = self.get_ip_db()
        self.headers = self.get_headers()

    def get_ip_db(self):
        # Load candidate proxies from the local IP-pool database and validate them
        print strftime("%Y-%m-%d %H:%M:%S", gmtime()), 'get new IP pool'
        conn = sqlite3.connect('/home/yxf/projects/IPProxys_yxf/data/IpPool.db')
        c = conn.cursor()
        c.execute(
            '''
            select * from proxys where speed < 5.0
            '''
        )
        l = c.fetchall()
        conn.commit()
        conn.close()
        # Keep only the proxies that pass the liveness check
        l = ['http://' + str(i[1]) + ':' + str(i[2]) for i in l]
        print 'Raw IP Pool Size is {}'.format(len(l))
        ip_pool = self.crawl_pool.map(self.valid_ip, l)
        ip_pool = [i for i in ip_pool if i is not None]
        print ip_pool
        print 'Validated IP Pool Size is {}'.format(len(ip_pool))
        return ip_pool

    def get_headers(self):
        # Build request headers with a random User-Agent from the config
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }
        return headers

    def try_ip(self, url, proxies, header):
        try:
            response = requests.get(url, headers=header, proxies=proxies, timeout=10)
            return response
        except Exception, e:
            # 250 is used as a sentinel status meaning "request failed / proxy unusable"
            return 250

    def downloader(self, url, proxy, header):
        response = self.try_ip(url, proxy, header)
        if response == 250:
            status = 250
        else:
            status = response.status_code
        return status, response


    # Check whether a proxy IP is usable by fetching TEST_URL through it
    def valid_ip(self, ip):
        proxy = {'http': ip}
        try:
            r = requests.get(url=TEST_URL, headers=self.headers, timeout=TIMEOUT, proxies=proxy)
            if r.ok:
                return ip
            else:
                return None
        except Exception, e:
            return None

    def crawl(self, data):
        i, ip, header = data
        proxy = {'http': ip}
        baseinfo = {}

        # Each project needs up to six pages: the main page plus five tab pages
        url = 'http://www.innotree.cn/project_extra/detail/' + str(i) + '.html'
        url1 = url + '?tab=introduction'
        url2 = url + '?tab=finance'
        url3 = url + '?tab=team'
        url4 = url + '?tab=rank'
        url5 = url + '?tab=customer'

        baseinfo['url'] = url
        baseinfo['index'] = i

        status0, response0 = self.downloader(url, proxy, header)

        if status0 == 200:
            baseinfo0 = parse_main(response0.content)
            baseinfo['main'] = baseinfo0

            # Only fetch a tab page when the main page says the corresponding menu exists
            if baseinfo0['menu_intro'] == "true":
                time.sleep(0.1)
                status1, response1 = self.downloader(url1, proxy, header)
                if status1 == 200:
                    baseinfo['intro'] = parse_intro(response1.content)
                else:
                    return status1, {'index': i, 'ip': ip, 'error': 'intro'}

            if baseinfo0['menu_finance'] == "true":
                time.sleep(0.1)
                status2, response2 = self.downloader(url2, proxy, header)
                if status2 == 200:
                    baseinfo['finance'] = parse_financing(response2.content)
                else:
                    return status2, {'index': i, 'ip': ip, 'error': 'finance'}

            if baseinfo0['menu_team'] == "true":
                time.sleep(0.1)
                status3, response3 = self.downloader(url3, proxy, header)
                if status3 == 200:
                    baseinfo['team'] = parse_team(response3.content)
                else:
                    return status3, {'index': i, 'ip': ip, 'error': 'team'}

            if baseinfo0['menu_rank'] == "true":
                time.sleep(0.1)
                status4, response4 = self.downloader(url4, proxy, header)
                if status4 == 200:
                    baseinfo['rank'] = parse_rank(response4.content)
                else:
                    return status4, {'index': i, 'ip': ip, 'error': 'rank'}

            if baseinfo0['menu_customer'] == "true":
                time.sleep(0.1)
                status5, response5 = self.downloader(url5, proxy, header)
                if status5 == 200:
                    baseinfo['customer'] = parse_customer(response5.content)
                else:
                    return status5, {'index': i, 'ip': ip, 'error': 'customer'}

            print i, ' ', url
            return status0, baseinfo
        else:
            return status0, {'index': i, 'ip': ip, 'error': 'main'}



    def run_concurrency(self):
        self.f = open(OUTPUT_FILE_PATH, 'w+')
        self.log = open(ERROR_FILE_PATH, 'w+')

        retry = range(RANGE[0], RANGE[1])
        retry_temp = []
        # Keep looping until there is nothing left to crawl or retry
        while len(retry) != 0 or len(retry_temp) != 0:
            retry.extend(retry_temp)
            retry_temp = []
            if len(retry) < 100:
                step = 10
            else:
                step = 100

            self.ip_pool = self.get_ip_db()
            # Work through the pending indexes in batches of `step`, one random proxy per index
            for start in range(0, len(retry), step):
                end = min(start + step, len(retry))
                data = [(index, random.choice(self.ip_pool), self.get_headers()) for index in retry[start:end]]
                baseinfo_list = self.crawl_pool.map(self.crawl, data)
                remove_ip = set()
                count1 = 0
                temp = []
                for status, baseinfo in baseinfo_list:
                    if status == 200:
                        self.f.write(json.dumps(baseinfo))
                        self.f.write('\n')
                        count1 = count1 + 1
                    elif status == 500:
                        # Server-side error: log the record and move on
                        self.log.write(json.dumps(baseinfo))
                        self.log.write('\n')
                    else:
                        # Anything else means the proxy is dead or banned: drop it and retry the index
                        print baseinfo
                        remove_ip.add(baseinfo['ip'])
                        temp.append(baseinfo['index'])
                print 'Removing Failed IPs, Success: {} and Retry: {}'.format(count1, len(temp))
                retry_temp.extend(temp)
                for ip in remove_ip:
                    self.ip_pool.remove(ip)

                if len(self.ip_pool) < 20:
                    print 'Getting New IP Pool'
                    self.ip_pool = self.get_ip_db()

            retry = []

        self.f.close()
        self.log.close()

    def run(self):
        # Sequential fallback: crawl the projects one by one without the gevent pool
        self.f = open(OUTPUT_FILE_PATH, 'w+')
        self.log = open(ERROR_FILE_PATH, 'w+')
        self.ip_pool = self.get_ip_db()
        for i in range(RANGE[0], RANGE[1]):
            status, baseinfo = self.crawl((i, random.choice(self.ip_pool), self.get_headers()))
            if status == 200:
                self.f.write(json.dumps(baseinfo))
                self.f.write('\n')
            else:
                self.log.write(json.dumps(baseinfo))
                self.log.write('\n')
        self.f.close()
        self.log.close()


if __name__ == "__main__":
    spider = Innotree_Spider()
    spider.run_concurrency()
    #spider.run()

3. Main parsing logic (parser.py):

#coding:utf-8
import re
from bs4 import BeautifulSoup


def parse_main(content):
    soup = BeautifulSoup(content, "lxml")
    # Pre-populated data dictionary for one project record
    baseInfo = {'name': '', 'round': '', 'tag': '', 'trade': '', 'website': '', 'des': '', 'company': '', 'tel': '',
                'email': '', 'time': '', 'addr': '', 'financing': '', 'com_owner': '', 'reg_money': '',
                'owner_num': '', 'business_scope': '', 'reg_num': '', 'social_num': '', 'operate_state': '',
                'agency_num': '', 'company_type': '', 'registration': '', 'detail_addr': '',
                'menu_intro': 'false', 'menu_finance': 'false', 'menu_team': 'false',
                'menu_rank': 'false', 'menu_customer': 'false'}

    # Check which tab menus exist on the main page
    if len(soup.select('.icon-nwico_ctree')):
        baseInfo['menu_intro'] = "true"      # registration info
    if len(soup.select('.icon-wico_mny')):
        baseInfo['menu_finance'] = "true"    # financing
    if len(soup.select('.icon-wico_tccle2')):
        baseInfo['menu_team'] = "true"       # team
    if len(soup.select('.icon-wico_tsxt')):
        baseInfo['menu_rank'] = "true"       # data performance / rank
    if len(soup.select('.icon-nwico_ympen')):
        baseInfo['menu_customer'] = "true"   # customer acquisition

    for name in soup.select('.details_0629_right_div01_right01_div01_span02'):
        baseInfo['name'] = name.get_text().replace('\n', '').replace(' ', '')  # project name
    for round in soup.select('.details_0629_right_div01_right01_div01_span01'):
        baseInfo['round'] = round.get_text().replace('\n', '').replace(' ', '')  # funding round
    # Tag list
    taglist = []
    for tag in soup.select('.details_0629_right_div01_right01_div03_div01 a'):
        taglist.append(tag.get_text().replace('\n', '').replace(' ', ''))
    baseInfo['tag'] = taglist  # tags
    for trade in soup.select('.details_0629_right_div01_right01_div03_div02'):
        baseInfo['trade'] = soup.select('.details_0629_right_div01_right01_div03_div02')[0].get_text().replace('\n', '').replace(' ', '').split('>', 1)  # industry
        baseInfo['website'] = soup.select('.details_0629_right_div01_right01_div03_div02')[1].get_text().replace('\n', '').replace(' ', '')  # website
    for des in soup.select('#introduction_container'):
        baseInfo['des'] = des.get_text().replace('\n', '').replace(' ', '')  # basic description
    for deta in soup.select('.details_0629_right_div02_right_div01 span'):
        baseInfo['company'] = soup.select('.details_0629_right_div02_right_div01 span')[1].get_text().replace('\n', '').replace(' ', '')  # company name
        baseInfo['tel'] = soup.select('.details_0629_right_div02_right_div01 span')[3].get_text().replace('\n', '').replace(' ', '')  # phone number
        baseInfo['email'] = soup.select('.details_0629_right_div02_right_div01 span')[5].get_text().replace('\n', '').replace(' ', '')  # e-mail address
    for deta1 in soup.select('.details_0629_right_div02_right_div02 span'):
        baseInfo['time'] = soup.select('.details_0629_right_div02_right_div02 span')[1].get_text().replace('\n', '').replace(' ', '')  # founding date
        baseInfo['addr'] = soup.select('.details_0629_right_div02_right_div02 span')[3].get_text().replace('\n', '').replace(' ', '')  # location
    for financing in soup.select('.details_0629_right_div03_div01 table'):
        baseInfo['financing'] = financing.get_text()
    return baseInfo

def parse_intro(content1):
    soup1 = BeautifulSoup(content1, "lxml")
    baseInfo = {}
    for zcxx in soup1.select('.pmiz_big'):
        baseInfo['com_owner'] = soup1.select('.pmiz_big')[0].get_text().replace('\n', '').replace(' ', '')  # legal representative
        baseInfo['reg_money'] = soup1.select('.pmiz_big')[1].get_text().replace('\n', '').replace(' ', '')  # registered capital
        baseInfo['owner_num'] = soup1.select('.pmiz_big')[3].get_text().replace('\n', '').replace(' ', '')  # current number of shareholders
    for zcxx1 in soup1.select('.pmic_info'):
        baseInfo['reg_num'] = soup1.select('.pmic_info')[0].get_text().replace('\n', '').replace(' ', '')  # registration number
        baseInfo['social_num'] = soup1.select('.pmic_info')[1].get_text().replace('\n', '').replace(' ', '')  # unified social credit code
        baseInfo['operate_state'] = soup1.select('.pmic_info')[2].get_text().replace('\n', '').replace(' ', '')  # operating status
        baseInfo['agency_num'] = soup1.select('.pmic_info')[3].get_text().replace('\n', '').replace(' ', '')  # organization code
        baseInfo['company_type'] = soup1.select('.pmic_info')[4].get_text().replace('\n', '').replace(' ', '')  # company type
        baseInfo['registration'] = soup1.select('.pmic_info')[5].get_text().replace('\n', '').replace(' ', '')  # registration authority
    for business_scope in soup1.select('#business_scope_container .show_line_inner'):
        baseInfo['business_scope'] = business_scope.get_text().replace('\n', '').replace(' ', '')  # business scope
    for detail_addr in soup1.select('.pmi_conect .details_0629_right_div02_right_div01_span01'):
        baseInfo['detail_addr'] = soup1.select('.pmi_conect .details_0629_right_div02_right_div01_span01')[0].get_text().replace('\n', '').replace(' ', '')  # full company address
    return baseInfo

def parse_financing(content2):
    # The financing history is embedded in the page as a JS variable
    baseInfo = {}
    pattern = re.compile('var finance_json = {(.*?)} ;')
    items = re.findall(pattern, content2)
    baseInfo['financing'] = items
    return baseInfo


def parse_team(content3):
    baseInfo = {'teamer': ''}
    soup3 = BeautifulSoup(content3, "lxml")
    teamLen = len(soup3.select('.org_show_one'))
    team_info = []
    if teamLen:
        for i in range(0, teamLen):
            team_item = []
            tags = []
            team_item.append(soup3.select('.org_show_one h2')[i].get_text().replace('\n', '').replace(' ', ''))
            # Collect the tags for this team member
            for tag in (soup3.select('.org_p_tags1')[i]).find_all('span'):
                tags.append(tag.get_text().replace('\n', '').replace(' ', ''))
            team_item.append(tags)

            # Collect the work-experience entries for this team member
            job_intro = []
            for exp in (soup3.select('.org_p_info .org_p_resume')[i]).find_all('li'):
                job_item = []
                job_item.append(exp.find('h3').get_text().replace('\n', '').replace(' ', ''))
                if len(exp.select('.dates')):
                    job_item.append(exp.select('.dates')[0].get_text().replace('\n', '').replace(' ', ''))
                else:
                    job_item.append('')
                if len(exp.select('.org_office')):
                    job_item.append(exp.select('.org_office')[0].get_text().replace('\n', '').replace(' ', ''))
                else:
                    job_item.append('')
                job_item.append(exp.find('p').get_text().replace('\n', '').replace(' ', ''))
                job_intro.append(job_item)
            team_item.append(job_intro)

            team_info.append(team_item)
    baseInfo['teamer'] = team_info
    return baseInfo


def parse_customer(content5):
    # Customer data is embedded as a JS variable; pull out the raw JSON string
    pattern5 = re.compile('customer_rank_compare_json = (.*?);', re.S)
    items5 = re.findall(pattern5, content5)
    return items5


def parse_rank(content4):
    # Parse the "data performance" page: the rank figures are embedded as a JS object
    pattern4 = re.compile('"industry_rank":(.*?)"app_rank":(.*?),"web_rank":(.*?),"media_rank":(.*?)};', re.S)
    items4 = re.findall(pattern4, content4)
    return items4
