async def beijing_spider():
    '''Crawl the Beijing government procurement site (ccgp-beijing.gov.cn).'''
    tim = time_spider()
    for i in range(1, 31):  # ~30 pages back-fills roughly one month; a daily run can use fewer pages
        print('Beijing procurement site, page %s' % i)
        url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/index_%s.html' % i
        try:
            proxies = random.sample(proxies_list, 1)[0]
            page = requests.get(url=url, headers=headers, proxies=proxies).text
            print(111, proxies)  # 111 = first attempt succeeded
        except Exception:
            # retry once with a fresh proxy
            proxies = random.sample(proxies_list, 1)[0]
            page = requests.get(url=url, headers=headers, proxies=proxies).text
            print(222, proxies)  # 222 = retry path
        tree = etree.HTML(page)
        ul_list = tree.xpath('//ul[@class="xinxi_ul"]/li')
        for ul in ul_list:
            name = ul.xpath('./a/text()')[0].strip()
            for ii in ll:  # ll is the keyword whitelist
                if ii in name:
                    new_url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/' + ul.xpath('./a/@href')[0].split('/')[-1]
                    datatime = ul.xpath('./span/text()')[0].strip()
                    pag = requests.get(url=new_url, headers=headers).text
                    source = name + datatime
                    source_id = hashlib.md5(source.encode()).hexdigest()  # unique id used for deduplication

                    # dedup check
                    sql = "select source from ZHAOBIAO_SPIDER where source='%s' " % source_id
                    ret = ora_obj.open_oracle(sql)
                    ora_obj.off()
                    if len(ret) > 0:
                        print('%s >>> already exists' % name)
                    else:
                        # not seen yet: insert the record and save the page
                        sql1 = "insert into ZHAOBIAO_SPIDER values(sys_guid(),'%s','%s',to_date('%s','yyyy-mm-dd'),'%s',to_date('%s','yyyy-mm-dd'))" % (source_id, name, datatime, new_url, tim)
                        ora_obj.oracle_work(sql1)
                        ora_obj.off()
                        filename = os.path.join(pat, 'html', '%s.html' % source_id)
                        with open(filename, 'w', encoding='utf8') as f:  # explicit encoding; the original relied on the platform default
                            f.write(pag)
                        print(source_id, name, datatime, new_url, tim)
    return
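
The crawlers in this post lean on several module-level names defined elsewhere in the project (headers, proxies_list, ll, pat, time_spider, time_time; ora_obj is a small Oracle helper exposing open_oracle / oracle_work / off and is not sketched here). A minimal sketch of what that setup might look like, purely as an assumption for readability -- the real values live in the original project:

# Hypothetical module-level setup -- the original post defines these elsewhere.
import os
import time
import random
import hashlib
import logging
import asyncio
import datetime
import traceback

import requests
from lxml import etree
from apscheduler.schedulers.blocking import BlockingScheduler

pat = os.path.dirname(os.path.abspath(__file__))       # base dir containing the html/ dump folder
headers = {'User-Agent': 'Mozilla/5.0'}                # assumed request headers
proxies_list = [{'http': 'http://127.0.0.1:8888'}]     # assumed proxy pool entries
ll = ['服务器', '存储', '交换机']                        # assumed keyword whitelist

def time_spider():
    '''Assumed helper: today's date as yyyy-mm-dd for the to_date() columns.'''
    return datetime.datetime.now().strftime('%Y-%m-%d')

def time_time():
    '''Assumed helper: a human-readable timestamp for the log banner.'''
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')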


async def jincaiwang_spider():
    '''Crawl Jincaiwang (cfcpn.com).'''
    tim = time_spider()
    for i in range(1, 31):  # ~30 pages back-fills roughly one month; a daily run can use fewer pages
        print('Jincaiwang, page %s' % i)
        url = 'http://www.cfcpn.com/plist/caigou?pageNo=%s' % i
        try:
            proxies = random.sample(proxies_list, 1)[0]
            page = requests.get(url=url, headers=headers, proxies=proxies).text  # bug fix: the sampled proxy was never passed here
            print(111, proxies)
        except Exception:
            # retry once with a fresh proxy
            proxies = random.sample(proxies_list, 1)[0]
            page = requests.get(url=url, headers=headers, proxies=proxies).text
            print(222, proxies)
        tree = etree.HTML(page)
        p_list = tree.xpath('//div[@class="cfcpn_list_content text-left"]')
        for p in p_list:
            name = p.xpath('./p[1]/a/text()')[0].strip()
            for ii in ll:  # ll is the keyword whitelist
                if ii in name:
                    new_url = 'http://www.cfcpn.com' + p.xpath('./p[1]/a/@href')[0]
                    datatime = p.xpath('./p[2]/text()')[0].strip().replace('发布时间:', '')[:10]
                    pag = requests.get(url=new_url, headers=headers).text
                    source = name + datatime
                    source_id = hashlib.md5(source.encode()).hexdigest()  # unique id used for deduplication

                    # dedup check
                    sql = "select source from ZHAOBIAO_SPIDER where source='%s' " % source_id
                    ret = ora_obj.open_oracle(sql)
                    ora_obj.off()
                    if len(ret) > 0:
                        print('%s >>> already exists' % name)
                    else:
                        # not seen yet: insert the record and save the page
                        sql1 = "insert into ZHAOBIAO_SPIDER values(sys_guid(),'%s','%s',to_date('%s','yyyy-mm-dd'),'%s',to_date('%s','yyyy-mm-dd'))" % (source_id, name, datatime, new_url, tim)
                        ora_obj.oracle_work(sql1)
                        ora_obj.off()
                        filename = os.path.join(pat, 'html', '%s.html' % source_id)
                        with open(filename, 'w', encoding='utf8') as f:
                            f.write(pag)
                        print(source_id, name, datatime, new_url, tim)
    return
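
Both the dedup query and the insert splice values straight into SQL strings with % formatting. If the underlying driver is cx_Oracle, the same statements can use bind variables, which also survives quotes in announcement titles. A hedged sketch, assuming a bare cx_Oracle connection instead of the ora_obj wrapper (is_seen and save_record are hypothetical helpers, not part of the original code):

import cx_Oracle  # assumed driver; the post's ora_obj wrapper is not shown

def is_seen(conn, source_id):
    '''Return True if source_id is already in ZHAOBIAO_SPIDER (bind variable, no string formatting).'''
    cur = conn.cursor()
    try:
        cur.execute("select source from ZHAOBIAO_SPIDER where source = :sid", sid=source_id)
        return cur.fetchone() is not None
    finally:
        cur.close()

def save_record(conn, source_id, name, datatime, new_url, tim):
    '''Insert one announcement row using bind variables.'''
    cur = conn.cursor()
    try:
        cur.execute(
            "insert into ZHAOBIAO_SPIDER values(sys_guid(), :sid, :name, "
            "to_date(:dt, 'yyyy-mm-dd'), :url, to_date(:tim, 'yyyy-mm-dd'))",
            sid=source_id, name=name, dt=datatime, url=new_url, tim=tim)
        conn.commit()
    finally:
        cur.close()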


async def zhongyang_spider():
    '''Crawl the central government procurement site (zycg.gov.cn).'''
    from selenium.webdriver.chrome.options import Options
    from selenium import webdriver
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(executable_path=r'conf\chromedriver.exe', chrome_options=chrome_options)
    try:
        tim = time_spider()
        for i in range(1, 11):  # ~10 pages covers roughly one month on this site
            print('Central procurement site, page %s' % i)
            url = 'http://www.zycg.gov.cn/article/llist?catalog=StockAffiche&page=%s' % i
            try:
                proxies = random.sample(proxies_list, 1)[0]
                page = requests.get(url=url, headers=headers, proxies=proxies).text
                print(111, proxies)
            except Exception:
                # retry once with a fresh proxy
                proxies = random.sample(proxies_list, 1)[0]
                page = requests.get(url=url, headers=headers, proxies=proxies).text
                print(222, proxies)
            tree = etree.HTML(page)
            li_list = tree.xpath('//ul[@class="lby-list"]/li')
            for li in li_list:
                try:
                    name = li.xpath('./a/text()')[0].strip()
                except IndexError:
                    name = ''
                for ii in ll:  # ll is the keyword whitelist
                    if ii in name:
                        new_url = 'http://www.zycg.gov.cn' + li.xpath('./a/@href')[0]
                        datatime = li.xpath('./span/text()')[0].strip().replace('[', '').replace(']', '')
                        source = name + datatime
                        source_id = hashlib.md5(source.encode()).hexdigest()  # unique id used for deduplication

                        # render the detail page; the announcement body sits inside an iframe
                        driver.get(new_url)
                        time.sleep(random.uniform(0.5, 1))
                        js = 'window.scrollTo(0,document.body.scrollHeight)'
                        driver.execute_script(js)
                        time.sleep(random.uniform(1, 2))
                        driver.switch_to.frame("ueditor_0")
                        pag = driver.page_source

                        # dedup check
                        sql = "select source from ZHAOBIAO_SPIDER where source='%s' " % source_id
                        ret = ora_obj.open_oracle(sql)
                        ora_obj.off()
                        if len(ret) > 0:
                            print('%s >>> already exists' % name)
                        else:
                            # not seen yet: insert the record and save the page
                            sql1 = "insert into ZHAOBIAO_SPIDER values(sys_guid(),'%s','%s',to_date('%s','yyyy-mm-dd'),'%s',to_date('%s','yyyy-mm-dd'))" % (source_id, name, datatime, new_url, tim)
                            ora_obj.oracle_work(sql1)
                            ora_obj.off()
                            filename = os.path.join(pat, 'html', '%s.html' % source_id)
                            with open(filename, 'w', encoding='GB18030') as f:
                                f.write(pag)
                            print(source_id, name, datatime, new_url, tim)
    except Exception as e:
        print(e)
    finally:
        driver.quit()
    return
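
One caveat on the async design: all three coroutines call requests and Selenium synchronously, so the tasks created in start() actually run back-to-back rather than concurrently. A sketch of one way to get real overlap without restructuring the crawlers -- fetch_page is a hypothetical helper that pushes the blocking requests.get into the loop's thread pool:

import asyncio
import requests

async def fetch_page(url, headers=None, proxies=None):
    '''Hypothetical helper: run the blocking requests.get in a worker thread.'''
    loop = asyncio.get_event_loop()
    # the lambda captures the keyword arguments for the thread-pool call
    return await loop.run_in_executor(
        None, lambda: requests.get(url, headers=headers, proxies=proxies, timeout=30).text)

# inside a crawler coroutine the call site would become:
#     page = await fetch_page(url, headers=headers, proxies=proxies)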


def start():
    print('----------------------%s-----------------------------' % time_time())
    loop = None
    try:
        # asyncio.get_event_loop() raises when called from an APScheduler worker thread:
        #   RuntimeError: There is no current event loop in thread 'ThreadPoolExecutor-0_0'.
        # With apscheduler + asyncio, create and install a fresh loop per run instead.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        bj = loop.create_task(beijing_spider())
        jc = loop.create_task(jincaiwang_spider())
        zy = loop.create_task(zhongyang_spider())

        loop.run_until_complete(asyncio.wait([bj, jc, zy]))
    except Exception:
        logging.error(traceback.format_exc())
    finally:
        if loop is not None:
            loop.close()  # close the per-run loop so repeated scheduled runs don't leak loops
        print('--------------------- finished ---------------------------')

if __name__ == '__main__':
    print('Waiting......')
    scheduler = BlockingScheduler()
    scheduler.add_job(start, 'cron', hour=8, minute=30)  # for quick testing: 'interval', seconds=40
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
    except Exception as e:
        print(e)
        logging.error(traceback.format_exc())
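
As an alternative to building a fresh event loop inside each job, APScheduler also ships an AsyncIOScheduler that can schedule coroutine functions directly. A hedged sketch of how the entry point might look with it (start_async is a hypothetical wrapper around the three crawlers above):

from apscheduler.schedulers.asyncio import AsyncIOScheduler
import asyncio

async def start_async():
    # gather runs the three crawlers on the scheduler's own event loop
    await asyncio.gather(beijing_spider(), jincaiwang_spider(), zhongyang_spider())

if __name__ == '__main__':
    scheduler = AsyncIOScheduler()
    scheduler.add_job(start_async, 'cron', hour=8, minute=30)
    scheduler.start()
    try:
        asyncio.get_event_loop().run_forever()
    except (KeyboardInterrupt, SystemExit):
        pass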