Selenium scraping
# encoding: utf-8
import os
import random
import socket
import time

import chardet
import openpyxl
import urllib.request
import urllib.error
from urllib.error import HTTPError
from http.client import IncompleteRead
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from selenium import webdriver
from selenium.webdriver.common.by import By
# used to check whether an element exists on the page
from selenium.common.exceptions import NoSuchElementException
def gettitle(url):
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0"
    ]
    random_header = random.choice(my_headers)  # rotate User-Agent per request
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random_header)
    try:
        response = urllib.request.urlopen(req).read()
    except HTTPError as e:
        print(e.code, 'cannot open page:', url)
        return None
    except urllib.error.URLError as e:
        print(e.reason)
        if isinstance(e.reason, socket.timeout):
            print('connection timed out:', url)
        return None
    except IncompleteRead as e:
        # keep whatever bytes arrived before the connection dropped
        response = e.partial
    html = response.decode(str(chardet.detect(response)['encoding']), "ignore")
    bsobj = BeautifulSoup(html, 'html.parser')
    return bsobj
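# Usage sketch (hypothetical URL): returns a BeautifulSoup object on success,
# or None when the request fails or times out.
# soup = gettitle('http://example.com')
# if soup is not None:
#     print(soup.title)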
# Write one row (s[0]..s[6]) to a spreadsheet
# saved as <scriptname>.xlsx next to this script
def file_write(a, b='', c='', d='', e='', f='', g=''):
    name = os.path.split(__file__)[-1].split(".")[0]
    dirname = os.path.dirname(__file__)
    xlsx_path = dirname + '/' + '{}.xlsx'.format(name)
    if os.path.exists(xlsx_path):  # create the *.xlsx file if it does not exist yet
        print('{} already exists in directory: {}'.format(name, dirname))
    else:
        wb = openpyxl.Workbook()
        ws = wb.active
        wb.save(xlsx_path)
    # open the workbook and append as usual
    wb = load_workbook(xlsx_path)
    ws = wb.active
    print('workbook loaded')
    bool_value = True
    for i in ws['b']:  # deduplicate on column B (the title column)
        if str(i.value) == str(b):
            bool_value = False
            print('duplicate title found')
    if bool_value:
        print('writing row')
        ws.append([a, b, c, d, e, f, g])
        wb.save(xlsx_path)
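# Usage sketch (hypothetical row values): appends one row to <scriptname>.xlsx,
# skipping it when column B already contains the same title.
# file_write('央视网', '某标题', '2020/06/01', '来源', '栏目', '西安', '长安')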
countylist = [
    "西安","西咸","高新","新城","莲湖","碑林","未央","雁塔","灞桥","阎良","长安","临潼","高陵","鄠邑","蓝田","周至",
    "宝鸡","金台","渭滨","陈仓","陇县","千阳","凤翔","麟游","岐山","扶风","眉县","凤县","太白",
    "咸阳","秦都","渭城","长武","淳化","旬邑","乾县","永寿","武功","三原","礼泉","泾阳","兴平","彬州",
    "铜川","耀州","王益","印台","宜君",
    "渭南","临渭","华州","大荔","合阳","白水","澄城","富平","蒲城","潼关","华阴",
    "韩城",
    "延安","宝塔","安塞","延长","延川","子长","志丹","富县","黄陵","黄龙","洛川","吴起","甘泉","宜川",
    "榆林","榆阳","横山","府谷","佳县","米脂","清涧","靖边","定边","绥德","吴堡","子洲","神木",
    "汉中","汉台","南郑","城固","洋县","西乡","宁强","略阳","镇巴","留坝","佛坪",
    "安康","汉滨","宁陕","石泉","汉阴","旬阳","白河","紫阳","岚皋","平利","镇坪",
    "商洛","商州","柞水","镇安","山阳","洛南","丹凤","商南",
    "杨陵"]
def fuping(txt):
    # True if the text mentions any poverty-alleviation / rural-affairs keyword
    keywords = ['贫', '丰收', '复工', '金山', '帮扶', '粮', '青山', '沙',
                '疫', '农民', '小麦', '夏收', '苏陕', '好日子']
    return any(k in txt for k in keywords)
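# e.g. fuping('小麦夏收') -> True, fuping('晚间天气预报') -> False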
# Given a text, return (city, county) for the first Shaanxi place name found
def xianshi(text):
    cityname = ""
    countyname = ""
    text = text.replace('陕西', '')
    x = 0
    while x < len(text):
        area = 0
        while area < len(countylist):
            if text[x:x+2] == countylist[area]:
                countyname = countylist[area]
                # map the matched index back to its prefecture-level city
                if -1 < area < 16:
                    cityname = countylist[0]    # 西安
                if 15 < area < 29:
                    cityname = countylist[16]   # 宝鸡
                if 28 < area < 43:
                    cityname = countylist[29]   # 咸阳
                if 42 < area < 48:
                    cityname = countylist[43]   # 铜川
                if 47 < area < 60:
                    cityname = countylist[48]   # 渭南
                if area == 59:
                    cityname = countylist[59]   # 韩城
                if 59 < area < 74:
                    cityname = countylist[60]   # 延安
                if 73 < area < 87:
                    cityname = countylist[74]   # 榆林
                if 86 < area < 98:
                    cityname = countylist[87]   # 汉中
                if 97 < area < 109:
                    cityname = countylist[98]   # 安康
                if 108 < area < 116:
                    cityname = countylist[109]  # 商洛
                if area == 116:
                    cityname = countylist[116]  # 杨陵
                return cityname, countyname
            area += 1
        x += 1
    return cityname, countyname
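# e.g. xianshi('陇县小麦丰收') returns ('宝鸡', '陇县');
# xianshi('央视新闻') returns ('', '') when no place name matches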
def isElementPresent(browser, path):
    browser.implicitly_wait(1)
    try:
        browser.find_element(By.XPATH, path)
    except NoSuchElementException:
        print('element not found:', path)
        return False  # NoSuchElementException: the element is absent from the page
    else:
        print('element found:', path)
        return True  # no exception: the element exists on the page
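# Usage sketch: check for a tab/element before clicking it, e.g.
# if isElementPresent(browser, '//*[@id="web"]'):
#     browser.find_element(By.XPATH, '//*[@id="web"]').click()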
def shiping(url):
    # video results tab
    browser = webdriver.Chrome()
    browser.implicitly_wait(5)  # implicit wait: up to 5 seconds per element lookup
    browser.get(url)
    objecta = BeautifulSoup(browser.page_source, 'html.parser')
    time.sleep(1)
    for i in range(1, 20):
        print('opening page {}++++++'.format(i))
        for x in objecta.findAll(class_="jvedio"):
            s = [0, 1, 2, 3, 4, 5, 6]
            biaoti = x.find('a').attrs['alt']
            if biaoti.count('陕西新闻联播'):
                s[0] = '陕西广播电视台'
            else:
                s[0] = '中央电视台'
            s[1] = biaoti[:-19]                      # title
            s[2] = biaoti[-19:-9].replace('-', '/')  # date as yyyy/mm/dd
            s[3] = x.find('span').get_text()
            s[4] = x.find('a').attrs['lanmu1']       # programme column
            s[5] = xianshi(biaoti)[0]                # city
            s[6] = xianshi(biaoti)[1]                # county
            if fuping(biaoti):
                print(s)
                file_write(s[0], s[1], s[2], s[3], s[4], s[5], s[6])
        browser.find_element(By.LINK_TEXT, '下一页').click()
        time.sleep(0.5)
        browser.switch_to.window(browser.window_handles[-1])
        objecta = BeautifulSoup(browser.page_source, 'html.parser')
        time.sleep(1)
    # web results tab
    browser.find_element(By.XPATH, '//*[@id="web"]').click()
    time.sleep(1)
    browser.switch_to.window(browser.window_handles[-1])
    objecta = BeautifulSoup(browser.page_source, 'html.parser')
    for m in range(1, 20):
        print('opening page {}++++++'.format(m))
        for x in objecta.findAll(class_="tright"):
            s = [0, 1, 2, 3, 4, 5, 6]
            s[0] = '央视网'
            s[1] = x.find('a').get_text()
            s[2] = x.find(class_="tim").get_text()[-19:-9].replace('-', '/')
            s[3] = x.find(class_="src").get_text()
            s[4] = x.find('span').attrs['lanmu1']
            s[5] = xianshi(x.get_text())[0]
            s[6] = xianshi(x.get_text())[1]
            if fuping(x.get_text()):
                print(s)
                file_write(s[0], s[1], s[2], s[3], s[4], s[5], s[6])
        browser.find_element(By.LINK_TEXT, '下一页').click()
        time.sleep(0.5)
        browser.switch_to.window(browser.window_handles[-1])
        objecta = BeautifulSoup(browser.page_source, 'html.parser')
        time.sleep(1)
    browser.quit()
if __name__ == '__main__':
    url = ''  # TODO: the target search-result URL is not given in the source; set it before running
    shiping(url)