Selenium scraping
# encoding: utf-8
import os
import random
import socket
import time

import chardet
import openpyxl
import urllib.request
import urllib.error
from urllib.error import HTTPError
from http.client import IncompleteRead
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from selenium import webdriver
from selenium.webdriver.common.by import By
# used to check whether an element exists on the page
from selenium.common.exceptions import NoSuchElementException
def gettitle(url):
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0"
    ]
    random_header = random.choice(my_headers)  # rotate User-Agent per request
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random_header)
    try:
        response = urllib.request.urlopen(req).read()
    except HTTPError as e:
        print(e.code, 'cannot open page:', url)
        return None
    except urllib.error.URLError as e:
        print(e.reason)
        if isinstance(e.reason, socket.timeout):
            print('connection timed out:', url)
        return None
    except IncompleteRead as e:
        # keep whatever bytes arrived before the connection dropped
        response = e.partial
    html = response.decode(str(chardet.detect(response)['encoding']), "ignore")
    bsobj = BeautifulSoup(html, 'html.parser')
    return bsobj
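# Usage sketch (hypothetical URL): returns a BeautifulSoup object on success,
# or None when the request fails or times out.
# soup = gettitle('http://example.com')
# if soup is not None:
#     print(soup.title)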
# Write one row (s[0]..s[6]) to a spreadsheet
# saved as <scriptname>.xlsx next to this script
def file_write(a, b='', c='', d='', e='', f='', g=''):
    name = os.path.split(__file__)[-1].split(".")[0]
    dirname = os.path.dirname(__file__)
    xlsx_path = dirname + '/' + '{}.xlsx'.format(name)
    if os.path.exists(xlsx_path):  # create the *.xlsx file if it does not exist yet
        print('{} already exists in directory: {}'.format(name, dirname))
    else:
        wb = openpyxl.Workbook()
        ws = wb.active
        wb.save(xlsx_path)
    # open the workbook and append as usual
    wb = load_workbook(xlsx_path)
    ws = wb.active
    print('workbook loaded')
    bool_value = True
    for i in ws['b']:  # deduplicate on column B (the title column)
        if str(i.value) == str(b):
            bool_value = False
            print('duplicate title found')
    if bool_value:
        print('writing row')
        ws.append([a, b, c, d, e, f, g])
        wb.save(xlsx_path)
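# Usage sketch (hypothetical row values): appends one row to <scriptname>.xlsx,
# skipping it when column B already contains the same title.
# file_write('央视网', '某标题', '2020/06/01', '来源', '栏目', '西安', '长安')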
countylist = [
    "西安","西咸","高新","新城","莲湖","碑林","未央","雁塔","灞桥","阎良","长安","临潼","高陵","鄠邑","蓝田","周至",
    "宝鸡","金台","渭滨","陈仓","陇县","千阳","凤翔","麟游","岐山","扶风","眉县","凤县","太白",
    "咸阳","秦都","渭城","长武","淳化","旬邑","乾县","永寿","武功","三原","礼泉","泾阳","兴平","彬州",
    "铜川","耀州","王益","印台","宜君",
    "渭南","临渭","华州","大荔","合阳","白水","澄城","富平","蒲城","潼关","华阴",
    "韩城",
    "延安","宝塔","安塞","延长","延川","子长","志丹","富县","黄陵","黄龙","洛川","吴起","甘泉","宜川",
    "榆林","榆阳","横山","府谷","佳县","米脂","清涧","靖边","定边","绥德","吴堡","子洲","神木",
    "汉中","汉台","南郑","城固","洋县","西乡","宁强","略阳","镇巴","留坝","佛坪",
    "安康","汉滨","宁陕","石泉","汉阴","旬阳","白河","紫阳","岚皋","平利","镇坪",
    "商洛","商州","柞水","镇安","山阳","洛南","丹凤","商南",
    "杨陵"]
def fuping(txt):
    # True if the text mentions any poverty-alleviation / rural-affairs keyword
    keywords = ['贫', '丰收', '复工', '金山', '帮扶', '粮', '青山', '沙',
                '疫', '农民', '小麦', '夏收', '苏陕', '好日子']
    return any(k in txt for k in keywords)
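# e.g. fuping('小麦夏收') -> True, fuping('晚间天气预报') -> False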
# Given a text, return (city, county) for the first Shaanxi place name found
def xianshi(text):
    cityname = ""
    countyname = ""
    text = text.replace('陕西', '')
    x = 0
    while x < len(text):
        area = 0
        while area < len(countylist):
            if text[x:x+2] == countylist[area]:
                countyname = countylist[area]
                # map the matched index back to its prefecture-level city
                if -1 < area < 16:
                    cityname = countylist[0]    # 西安
                if 15 < area < 29:
                    cityname = countylist[16]   # 宝鸡
                if 28 < area < 43:
                    cityname = countylist[29]   # 咸阳
                if 42 < area < 48:
                    cityname = countylist[43]   # 铜川
                if 47 < area < 60:
                    cityname = countylist[48]   # 渭南
                if area == 59:
                    cityname = countylist[59]   # 韩城
                if 59 < area < 74:
                    cityname = countylist[60]   # 延安
                if 73 < area < 87:
                    cityname = countylist[74]   # 榆林
                if 86 < area < 98:
                    cityname = countylist[87]   # 汉中
                if 97 < area < 109:
                    cityname = countylist[98]   # 安康
                if 108 < area < 116:
                    cityname = countylist[109]  # 商洛
                if area == 116:
                    cityname = countylist[116]  # 杨陵
                return cityname, countyname
            area += 1
        x += 1
    return cityname, countyname
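# e.g. xianshi('陇县小麦丰收') returns ('宝鸡', '陇县');
# xianshi('央视新闻') returns ('', '') when no place name matches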
def isElementPresent(browser, path):
    browser.implicitly_wait(1)
    try:
        browser.find_element(By.XPATH, path)
    except NoSuchElementException:
        print('element not found:', path)
        return False  # NoSuchElementException: the element is absent from the page
    else:
        print('element found:', path)
        return True  # no exception: the element exists on the page
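# Usage sketch: check for a tab/element before clicking it, e.g.
# if isElementPresent(browser, '//*[@id="web"]'):
#     browser.find_element(By.XPATH, '//*[@id="web"]').click()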
def shiping(url):
    # video results tab
    browser = webdriver.Chrome()
    browser.implicitly_wait(5)  # implicit wait: up to 5 seconds per element lookup
    browser.get(url)
    objecta = BeautifulSoup(browser.page_source, 'html.parser')
    time.sleep(1)
    for i in range(1, 20):
        print('opening page {}++++++'.format(i))
        for x in objecta.findAll(class_="jvedio"):
            s = [0, 1, 2, 3, 4, 5, 6]
            biaoti = x.find('a').attrs['alt']
            if biaoti.count('陕西新闻联播'):
                s[0] = '陕西广播电视台'
            else:
                s[0] = '中央电视台'
            s[1] = biaoti[:-19]                      # title
            s[2] = biaoti[-19:-9].replace('-', '/')  # date as yyyy/mm/dd
            s[3] = x.find('span').get_text()
            s[4] = x.find('a').attrs['lanmu1']       # programme column
            s[5] = xianshi(biaoti)[0]                # city
            s[6] = xianshi(biaoti)[1]                # county
            if fuping(biaoti):
                print(s)
                file_write(s[0], s[1], s[2], s[3], s[4], s[5], s[6])
        browser.find_element(By.LINK_TEXT, '下一页').click()
        time.sleep(0.5)
        browser.switch_to.window(browser.window_handles[-1])
        objecta = BeautifulSoup(browser.page_source, 'html.parser')
        time.sleep(1)
    # web results tab
    browser.find_element(By.XPATH, '//*[@id="web"]').click()
    time.sleep(1)
    browser.switch_to.window(browser.window_handles[-1])
    objecta = BeautifulSoup(browser.page_source, 'html.parser')
    for m in range(1, 20):
        print('opening page {}++++++'.format(m))
        for x in objecta.findAll(class_="tright"):
            s = [0, 1, 2, 3, 4, 5, 6]
            s[0] = '央视网'
            s[1] = x.find('a').get_text()
            s[2] = x.find(class_="tim").get_text()[-19:-9].replace('-', '/')
            s[3] = x.find(class_="src").get_text()
            s[4] = x.find('span').attrs['lanmu1']
            s[5] = xianshi(x.get_text())[0]
            s[6] = xianshi(x.get_text())[1]
            if fuping(x.get_text()):
                print(s)
                file_write(s[0], s[1], s[2], s[3], s[4], s[5], s[6])
        browser.find_element(By.LINK_TEXT, '下一页').click()
        time.sleep(0.5)
        browser.switch_to.window(browser.window_handles[-1])
        objecta = BeautifulSoup(browser.page_source, 'html.parser')
        time.sleep(1)
    browser.quit()
if __name__ == '__main__':
    url = ''  # TODO: the target search-result URL is not given in the source; set it before running
    shiping(url)