Python: open blog pages in the browser with Python
Automatically launch the browser and open the specified pages.
PS: a known limitation of this program is that it only handles the first page of the blog list; pagination is left as an extension (a rough pagination sketch follows the first script below).
# coding:utf-8
import webbrowser as web
import os
import time
import random
import urllib2
from bs4 import BeautifulSoup
def getPage():  # pretend to be a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
    # hand the page source to BeautifulSoup for parsing
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')  # locate every article item
    for item in items:
        content = item.find('a')
        url = content.get('href')  # relative link of each article
        url = baseurl + url        # build a full, clickable address
        urlList.append(url)
    return urlList
urls = getPage()  # list of article URLs
i = 0
while i < len(urls):  # one loop pass (and one browser tab) per article URL
    web.open_new_tab(urls[i])  # open the page in the default browser
    i = i + 1
    time.sleep(1)  # pause 1s after each tab so the machine does not freeze
else:
    time.sleep(2)
    os.system('taskkill /f /IM Chrome.exe')  # once every page has been opened, kill the browser so processes do not pile up
    print u"browser closed"
Now let's tweak the code
to add the following feature:
count the blog articles published under the account and sort them by view count, from lowest to highest.
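The crux of the sorting feature is turning the view-count label shown next to each article into an integer. A minimal, standalone sketch of that extraction (the sample labels are made up; the real text depends on the page layout):

import re

def view_count(label):
    # keep only the digits of a label such as 'read(1234)' and return them as an int
    digits = re.sub(r'\D', '', label)
    return int(digits) if digits else 0

print(view_count('read(1234)'))  # 1234
print(view_count('56 views'))    # 56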
#coding:utf-8
import urllib2,re,time,random,os
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def getPage():  # pretend to be a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []
    sortlist1 = []
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
    fd = open('counter.txt', 'w')
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    print u'found %d articles in total' % len(items)
    for item in items:
        aa = {}
        content = item.find('a')
        contemtUrl = baseurl + content.get('href')
        #print contemtUrl
        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())
        #print readtime
        readtimeNumber = int(filter(str.isdigit, readtime))  # keep only the digits of the view count
        #print readtimeNumber
        sortlist1.append(readtimeNumber)
        #time.sleep(2)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1
    for i in sortlist1:  # walk the sorted view counts and emit the matching article
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    fd.close()
    return urlList
urls=getPage()
The snippet below is a small extra feature:
count = random.randint(3, 10)
j = 0
while j < count:
    for i in range(5):  # open the first five articles
        web.open_new_tab(urls[i])
        time.sleep(1)
    os.system('taskkill /f /IM Chrome.exe')
    j = j + 1
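The loop above keeps reopening the same five posts. If you would rather work through the whole list, a batched variant along the lines of the sketch below (my own variation, not part of the original script) keeps the number of open tabs bounded:

# Sketch: open the collected URLs five at a time, closing the browser between batches.
import os
import time
import webbrowser as web

def open_in_batches(urls, batch_size=5, pause=1):
    for start in range(0, len(urls), batch_size):
        for u in urls[start:start + batch_size]:
            web.open_new_tab(u)
            time.sleep(pause)  # small pause so the machine keeps up
        time.sleep(2)
        os.system('taskkill /f /IM Chrome.exe')  # adjust to the browser you actually use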
And here is a Python 3 version:
#coding:utf-8
import urllib,re,time,random,os
import urllib.request as urllib2
from bs4 import BeautifulSoup
import webbrowser as web
import sys
from functools import cmp_to_key
def getPage():  # pretend to be a browser and fetch the page source
    url = 'https://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []
    sortlist1 = []
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        print(e.code)
        print(e.reason)
    fd = open('counter.txt', 'w')
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='article-item-box csdn-tracking-statistics')
    print('found %d articles in total' % len(items))
    for item in items:
        aa = {}
        content = item.find('h4').find('a')
        contemtUrl = content.get('href')
        #print(contemtUrl)
        content = content.text.strip()
        #print(content)
        read_time_content = item.find('div', class_='info-box d-flex align-content-center')
        #read_time = read_time_content.find('span', class_='read_num')
        #readtime = str(read_time.text.strip())
        read_time = read_time_content.find('span', class_='read-num')
        readtime = str(read_time.text.strip())
        #print(read_time)
        readtimeNumber = int(re.sub(r'\D', '', readtime))  # keep only the digits of the view count
        #print(readtimeNumber)
        sortlist1.append(readtimeNumber)
        #time.sleep(2)
        aa['indexs'] = readtimeNumber
        aa['content'] = content
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print(sortlist1)
    sortlist = sorted(sortlist, key=cmp_to_key(lambda x, y: x['indexs'] - y['indexs']))  # ascending by view count
    for i in sortlist:
        totalcontent = i['content'] + '\t' + i['read_time'] + '\t' + i['contemtUrl']
        #print(totalcontent)
        fd.write(totalcontent)
        fd.write('\n')
        urlList.append(i['contemtUrl'])
        contentList.append(totalcontent)
    '''
    old version of the loop, kept for reference:
    sortlist1.sort()
    print(sortlist1)
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']
                print(totalcontent)
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    '''
    fd.close()
    return urlList
urls=getPage()
count=random.randint(1,3)
j=0
while j < count:
    for i in range(5):
        web.open_new_tab(urls[i])
        time.sleep(1)
    #os.system('taskkill /f /IM Chrome.exe')
    os.system('taskkill /f /IM SogouExplorer.exe')
    j = j + 1
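One closing note on the sort: cmp_to_key works, but since the ordering is just the numeric view count, a plain key function is the more idiomatic Python 3 choice. A tiny, self-contained illustration (the sample data is made up):

# Equivalent sort using a key function instead of cmp_to_key.
articles = [
    {'content': 'post A', 'indexs': 120},
    {'content': 'post B', 'indexs': 35},
]
articles_sorted = sorted(articles, key=lambda a: a['indexs'])  # ascending by view count
print([a['content'] for a in articles_sorted])  # ['post B', 'post A']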